# NLP and Feature Engineering

In [8]:
from bs4 import BeautifulSoup
from textblob import TextBlob
import pandas as pd
import numpy as np

In [9]:
# Load the posts written out by the earlier "API and Data Gathering" step.
# keep_default_na=False leaves empty cells as empty strings instead of NaN;
# this matters later because BeautifulSoup cannot be run on a null cell.
df = pd.read_csv(
    './data/python.csv',
    keep_default_na=False,
)

In [10]:
# Quick reminder of the raw layout: show the first five rows.
df.head(n=5)

Unnamed: 0,python,selftext,title
0,1,"i want to open multiple ssh session, one to my...",paramiko multiple ssh session
1,1,,"[100% off] decision trees, random forests, ada..."
2,1,i have to write a script for 10 children in cl...,need some help from the smart people .
3,1,,i made tinder bot written in python selenium
4,1,streamlit: [https://www.streamlit.io/](https:/...,is it possible to host a streamlit app on vercel?


In [11]:
# Add the empty (all-NaN) placeholder columns that the feature-engineering
# cells below fill in. The tuple order is the order in which the columns are
# appended to the frame.
placeholder_columns = (
    'self_pol',
    'self_sub',
    'title_pol',
    'title_sub',
    'title_words',
    'self_words',
    'words',
    'sentences',
    'text_pol',
    'text_sub',
)
for column_name in placeholder_columns:
    df[column_name] = np.nan

In [12]:
# Confirm the placeholder columns were added: show the first five rows.
df.head(n=5)

Unnamed: 0,python,selftext,title,self_pol,self_sub,title_pol,title_sub,title_words,self_words,words,sentences,text_pol,text_sub
0,1,"i want to open multiple ssh session, one to my...",paramiko multiple ssh session,,,,,,,,,,
1,1,,"[100% off] decision trees, random forests, ada...",,,,,,,,,,
2,1,i have to write a script for 10 children in cl...,need some help from the smart people .,,,,,,,,,,
3,1,,i made tinder bot written in python selenium,,,,,,,,,,
4,1,streamlit: [https://www.streamlit.io/](https:/...,is it possible to host a streamlit app on vercel?,,,,,,,,,,


In [13]:
# Remove links from both text columns.
#
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would silently leave every
# link in place.
#
# Pattern, piece by piece:
#   \S?         an optional single character glued to the link, e.g. the "(" or
#               "[" of a markdown link. It is optional so that bare links at the
#               start of a post or right after whitespace are removed too (a
#               mandatory \S skips those).
#   https?://   a real URL scheme, so ordinary words that merely contain "http"
#               (e.g. "aiohttp.") are left alone.
#   \S+         everything up to the next whitespace.
link_pattern = r'\S?https?://\S+'
for text_column in ('selftext', 'title'):
    df[text_column] = df[text_column].str.replace(link_pattern, "", regex=True)

# Data Cleaning/Feature Engineering

In [14]:
# Fill 'sentences' with the number of sentences TextBlob finds in each selftext.
# The count is taken before the HTML stripping below, on the text as it stands
# when this cell starts.
#
# Whole-column assignment is used instead of a row loop with
# `df['sentences'].at[i] = ...`: that form is a chained assignment, which pandas
# does not guarantee to write back to `df` (it is a no-op under copy-on-write).
# It also removes the assumption that the index is 0..n-1.
# The cast to float keeps the dtype the NaN placeholder column already had.
df['sentences'] = (
    df['selftext']
    .map(lambda post: len(TextBlob(post).sentences))
    .astype(float)
)

# Run selftext and title through BeautifulSoup to drop HTML tags missed in
# cleaning. The parser is named explicitly so the result does not depend on
# which optional parsers happen to be installed (and to avoid bs4's
# "no parser was explicitly specified" warning).
for text_column in ('selftext', 'title'):
    df[text_column] = df[text_column].map(
        lambda markup: BeautifulSoup(markup, 'html.parser').get_text()
    )



> # Note: some links were missed by the cleaning steps above; there was no time to fix this at the moment.

In [15]:
# Replace every non-letter character with a space. This mostly removes
# punctuation and digits; substituting a space (rather than nothing) keeps
# neighbouring words separated.
#
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so the character class would never
# match and the text would pass through unchanged.
for text_column in ('selftext', 'title'):
    df[text_column] = df[text_column].str.replace(r'[^a-zA-Z]', " ", regex=True)

In [16]:
# Combined text of each post. A space is inserted between body and title so the
# last word of the selftext is not glued to the first word of the title before
# sentiment analysis. Built once for the whole frame rather than once per row.
df['text'] = df['selftext'] + " " + df['title']

# Sentiment (polarity, subjectivity) of selftext, title and the combined text.
# Each column is assigned as a whole: the row-wise `df[col].at[i] = ...` form is
# a chained assignment that pandas does not guarantee to write back to `df`.
for source_column, prefix in (('selftext', 'self'), ('title', 'title'), ('text', 'text')):
    sentiments = [TextBlob(document).sentiment for document in df[source_column]]
    df[prefix + '_pol'] = [sentiment.polarity for sentiment in sentiments]
    df[prefix + '_sub'] = [sentiment.subjectivity for sentiment in sentiments]

# Word counts for title and selftext, plus their total. The casts to float keep
# the dtype the NaN placeholder columns already had.
df['title_words'] = (
    df['title'].map(lambda document: len(TextBlob(document).words)).astype(float)
)
df['self_words'] = (
    df['selftext'].map(lambda document: len(TextBlob(document).words)).astype(float)
)
df['words'] = df['title_words'] + df['self_words']

In [20]:
# Inspect the engineered features: show the first five rows.
df.head(n=5)

Unnamed: 0,python,selftext,title,self_pol,self_sub,title_pol,title_sub,title_words,self_words,words,sentences,text_pol,text_sub,text
0,1,i want to open multiple ssh session one to my...,paramiko multiple ssh session,0.0,0.25,0.0,0.0,4.0,29.0,33.0,3.0,0.0,0.166667,i want to open multiple ssh session one to my...
1,1,,off decision trees random forests ada...,0.0,0.0,-0.5,0.5,9.0,0.0,9.0,0.0,-0.5,0.5,off decision trees random forests ada...
2,1,i have to write a script for children in cl...,need some help from the smart people,0.091667,0.338889,0.214286,0.642857,7.0,86.0,93.0,9.0,0.038095,0.303175,i have to write a script for children in cl...
3,1,,i made tinder bot written in python selenium,0.0,0.0,0.0,0.0,8.0,0.0,8.0,0.0,0.0,0.0,i made tinder bot written in python selenium
4,1,streamlit vercel i can t figure out how ...,is it possible to host a streamlit app on vercel,0.0,0.0,0.0,1.0,10.0,27.0,37.0,1.0,0.0,1.0,streamlit vercel i can t figure out how ...


In [21]:
# Persist the engineered frame for the modeling step.
# index=False leaves the row index out of the written file.
df.to_csv(
    './data/python1.csv',
    index=False,
)