# Cleaning and Document-Term Matrix (DTM)

In [1]:
import pandas as pd
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from many_stop_words import get_stop_words
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the scraped comments from comments.csv and preview the first rows.
comments = pd.read_csv('comments.csv')
comments.head()

Unnamed: 0,Permalink,Comment
0,1456237437890333,Thank you
1,1456237437890333,Shouldn't we have backup on MeWe in case Faceb...
2,1456237437890333,See this
3,1456237437890333,Re-Open America
4,1456237437890333,Why did one person hit the laugh button on this?


In [3]:
# Load the scraped posts from posts.csv and preview the first rows.
posts = pd.read_csv('posts.csv')
posts.head()

Unnamed: 0,Permalink,Post
0,1462713730576037,CVS on that bs!
1,1462379490609461,I’ve never seen this much unrest in the states...
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...
3,1461150450732365,This is whats wrong with attention seeking mil...
4,1462158127298264,Upset Californian Conservative


In [4]:
# Give posts and comments a shared text column name ('full_text') so they can
# be stacked into a single DataFrame.
for frame, text_column in ((comments, 'Comment'), (posts, 'Post')):
    frame.rename(columns={text_column: 'full_text'}, inplace=True)

# Posts first, then comments, with a fresh 0..n-1 index.
all_text = pd.concat(
    [posts, comments],
    axis=0,
    ignore_index=True,
    sort=False,
)
all_text.head()

Unnamed: 0,Permalink,full_text
0,1462713730576037,CVS on that bs!
1,1462379490609461,I’ve never seen this much unrest in the states...
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...
3,1461150450732365,This is whats wrong with attention seeking mil...
4,1462158127298264,Upset Californian Conservative


In [5]:
# Number of rows (posts + comments) before any de-duplication.
all_text.shape[0]

4525

In [6]:
# Rows that would remain if every row belonging to an exact-duplicate group
# (all copies, not just the repeats) were removed.
int((~all_text.duplicated(keep=False)).sum())

4445

In [7]:
# Preview rows that repeat an earlier row exactly (first occurrences are not flagged).
repeated_row_mask = all_text.duplicated()
all_text.loc[repeated_row_mask].head()

Unnamed: 0,Permalink,full_text
119,1462713730576037,Not law
138,1462713730576037,Not a law
698,1461150450732365,Fake
729,1461150450732365,
828,1461150450732365,


In [8]:
# Remove rows that are exact duplicates across all columns, modifying all_text in place.
# NOTE(review): keep=False discards *every* member of a duplicate group, including
# the first occurrence, so a comment that appears twice is removed entirely rather
# than kept once — confirm this is intended (keep='first' would retain one copy).
all_text.drop_duplicates(keep=False, inplace = True)
all_text.head()

Unnamed: 0,Permalink,full_text
0,1462713730576037,CVS on that bs!
1,1462379490609461,I’ve never seen this much unrest in the states...
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...
3,1461150450732365,This is whats wrong with attention seeking mil...
4,1462158127298264,Upset Californian Conservative


In [9]:
# How many rows repeat a full_text value already seen in an earlier row
# (Permalink is ignored for this comparison).
int(all_text.duplicated(subset=['full_text']).sum())

73

In [10]:
# Most of the repeated texts are the page description, which was scraped again
# on each post page.
repeated_text_mask = all_text.duplicated(subset=['full_text'])
all_text.loc[repeated_text_mask]

Unnamed: 0,Permalink,full_text
26,?__cft__[0]=AZXDQ5qNWO5x_H6szjyXy-59U3EyBGL0Yx...,CVS on that bs!
161,1462713730576037,We are residents of The great US of A that sta...
562,1462379490609461,We are residents of The great US of A that sta...
581,1462715223909221,We are residents of The great US of A that sta...
876,1461150450732365,We are residents of The great US of A that sta...
...,...,...
4511,1459184227595654,We are residents of The great US of A that sta...
4512,?__cft__[0]=AZXDQ5qNWO5x_H6szjyXy-59U3EyBGL0Yx...,Thank you for the invite. Have a Great Memori...
4513,?__cft__[0]=AZXDQ5qNWO5x_H6szjyXy-59U3EyBGL0Yx...,What this silly meme fails to mention is almos...
4521,?__cft__[0]=AZXDQ5qNWO5x_H6szjyXy-59U3EyBGL0Yx...,We are residents of The great US of A that sta...


In [11]:
# Drop every row whose full_text occurs more than once, regardless of Permalink.
# keep=False removes all copies, which clears out the repeated page description.
# NOTE(review): this also drops any text that merely appears twice — row 0
# ("CVS on that bs!") is gone from the next preview — confirm that is acceptable.
all_text.drop_duplicates(subset = 'full_text', keep=False, inplace = True)

In [12]:
# Preview the frame after both de-duplication passes (original index labels are kept).
all_text.head()

Unnamed: 0,Permalink,full_text
1,1462379490609461,I’ve never seen this much unrest in the states...
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...
3,1461150450732365,This is whats wrong with attention seeking mil...
4,1462158127298264,Upset Californian Conservative
5,1462708023909941,Live Stream. CREW dragon launch.


In [13]:
# Number of rows left after both de-duplication passes.
all_text.shape[0]

4345

In [14]:
# Count rows whose full_text is missing.
int(all_text['full_text'].isnull().sum())

0

In [15]:
# Cleaning up the full text
_NON_LETTER_RE = re.compile("[^a-zA-Z]")     # anything that is not an ASCII letter
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')    # built once instead of once per document
_STOPWORDS_BY_LANGUAGE = {}                  # language -> frozenset of stop words


def _stopword_set(language='english'):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset.

    The corpus list is fetched only on the first call for each language, so
    later membership tests are O(1) set lookups.
    """
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def pre_processing_data(raw_text):
    """Normalise one post/comment into a space-separated string of content words.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it into word tokens.
      3. Drop NLTK English stop words.

    Parameters
    ----------
    raw_text : str
        Original text. Any non-string value (e.g. a NaN from a missing cell)
        is treated as having no text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` when nothing is
        left or the input was not a string.
    """
    # Missing cells arrive from pandas as float NaN; re.sub would raise TypeError.
    if not isinstance(raw_text, str):
        return ""

    letters_only = _NON_LETTER_RE.sub(" ", raw_text)
    text_tokens = _WORD_TOKENIZER.tokenize(letters_only.lower())

    # Look the stop words up once per call, not once per token.
    english_stopwords = _stopword_set('english')
    return " ".join(w for w in text_tokens if w not in english_stopwords)

In [16]:
# Add a 'text' column holding the cleaned version of every post/comment.
all_text['text'] = all_text['full_text'].map(pre_processing_data)
all_text.head()

Unnamed: 0,Permalink,full_text,text
1,1462379490609461,I’ve never seen this much unrest in the states...,never seen much unrest states anyone else
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...,attention business owners masked people severi...
3,1461150450732365,This is whats wrong with attention seeking mil...,whats wrong attention seeking millenials young...
4,1462158127298264,Upset Californian Conservative,upset californian conservative
5,1462708023909941,Live Stream. CREW dragon launch.,live stream crew dragon launch


In [17]:
# Flatten every cleaned document into one list of tokens so the vocabulary
# used across all posts and comments can be inspected by eye.
all_words = [
    token.strip()
    for row_label in all_text.index
    for token in all_text.text[row_label].split(' ')
]
unique_words = set(all_words)

# Total token count, then the number of distinct tokens.
print(len(all_words))
print(len(unique_words))

32610
6990


In [18]:
# Print the whole vocabulary in alphabetical order for manual inspection.
print(sorted(unique_words))



In [19]:
# DTM (document-term matrix): one row per post/comment, one column per term.
cvec = CountVectorizer(stop_words = 'english')
all_text_cvec = cvec.fit_transform(all_text['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# Transform the count-vectorized text into a DataFrame. toarray() yields a plain
# ndarray (todense() returns the discouraged np.matrix type), and reusing
# all_text's index keeps each DTM row aligned with its source row.
all_text_df = pd.DataFrame(
    all_text_cvec.toarray(),
    columns = feature_names,
    index = all_text.index,
)
# Index alignment attaches the right Permalink to each document row.
all_text_df['Permalink'] = all_text.Permalink
all_text_df.head()



Unnamed: 0,aaa,aaaaaevgncl,aakiydertvy,aaron,abandonment,abbot,abbott,abc,abdus,abhorrence,...,zink,zombie,zombies,zone,zones,zuckerbergs,zux,zuyubaetw,zwf,Permalink
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462379490609461
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462715223909221
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1461150450732365
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462158127298264
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462708023909941


In [20]:
# Persist the document-term matrix (term counts plus the Permalink column).
all_text_df.to_pickle("dtm.pkl")

# Persist the de-duplicated text frame, including the cleaned 'text' column.
all_text.to_pickle('all_text_clean.pkl')