In [1]:
from collections import OrderedDict
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn import metrics

In [3]:
# Load the tweet table from disk and preview its first five rows.
df = pd.read_csv('aggregation.csv')
df.head(5)

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions,high_response
0,They are starting to get more and more despera...,24298,168648,GretaThunberg,17 year old climate and environmental activist...,True,4079169,2020-02-29 15:26:10,True,False,True
1,I do not believe we will defeat Donald Trump w...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for Pr...,True,10951634,2020-03-02 20:30:56,False,False,True
2,Indigenous rights = Climate justice\n#Wetsuwet...,4609,21488,GretaThunberg,17 year old climate and environmental activist...,True,4086646,2020-02-08 13:36:48,True,False,True
3,Stop running away from your problem. Run into ...,2739,16317,pulte,The Philanthropist. Inventor of Twitter Philan...,True,2059165,2020-02-29 21:19:22,True,False,True
4,Support the Wet’suwet’en Nation and the pipeli...,2972,10035,GretaThunberg,17 year old climate and environmental activist...,True,4091979,2020-02-18 10:13:02,True,False,True


In [4]:
# Extract the tweet text as a plain Python list of strings.
# Missing values are replaced with '' first: a float NaN in the list would make
# the regex clean-up cells raise TypeError, since re substitution needs str input.
tweet_corpus = df['text'].fillna('').tolist()

In [5]:
# Remove URL junk: drop anything that starts with "http" (case-insensitive)
# and runs up to the next whitespace character.
# The pattern is a raw string because `\S` is a regex escape, not a Python
# string escape; in a non-raw literal it is an invalid escape sequence.
url_pattern = re.compile(r"http\S+", re.I)

# Slice assignment rewrites the existing list in place, so re-running the cell
# is harmless (already-stripped text simply has nothing left to match).
tweet_corpus[:] = [url_pattern.sub('', tweet) for tweet in tweet_corpus]

In [6]:
# Strip every character whose code point lies above U+2300.
# NOTE: despite the variable name, this is not an ASCII-only filter; accented
# letters and typographic punctuation below U+2300 are kept.
non_ascii_pattern = re.compile("[^\u0000-\u2300]")

# Rewrite the corpus in place, one cleaned string per tweet.
tweet_corpus[:] = [non_ascii_pattern.sub('', tweet) for tweet in tweet_corpus]

In [7]:
# Extra terms to exclude from the vocabulary, in addition to the standard
# English stop words.
add_stop_words = [
    'wetsuweten', 'wet', 'suwet', 'en',
    'wetsuwetenstrong', 'wetsuwetensolidarity', 'shutdowncanada',
    'bc', 'british', 'columbia', 'canada', 'indigenous', 'pipeline',
]

# Union of scikit-learn's built-in English list and the extra terms above.
custom_stop_words = set(ENGLISH_STOP_WORDS) | set(add_stop_words)

In [44]:
# Build a unigram TF-IDF matrix from the cleaned tweets.
# NOTE: despite its name, `count_vect` is a TfidfVectorizer, not a CountVectorizer.
#
# `stop_words` must be a list: recent scikit-learn releases validate this
# parameter and reject a set. Sorting also makes the argument deterministic.
count_vect = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             stop_words=sorted(custom_stop_words),
                             ngram_range=(1, 1))

# X is the sparse document-term matrix: one row per tweet, one column per term.
X = count_vect.fit_transform(tweet_corpus)

In [45]:
# List the vocabulary terms in column order.
# `get_feature_names()` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out()`; prefer the new method and fall back to the old
# one so the cell runs on either side of the change.
if hasattr(count_vect, "get_feature_names_out"):
    # The new method returns an ndarray; convert so `terms` stays a plain list.
    terms = list(count_vect.get_feature_names_out())
else:
    terms = count_vect.get_feature_names()
print("1-grams:", len(terms))

1-grams: 8385


In [46]:
# Column index assigned to the term 'coronavirus' (None if the term is absent).
count_vect.vocabulary_.get('coronavirus')

1744

In [30]:
# Column index assigned to the term 'trudeau' (None if the term is absent).
count_vect.vocabulary_.get('trudeau')

7712

In [31]:
# Sanity check on the custom stop words: 'canada' is in `add_stop_words`, so it
# should not be in the vocabulary and this lookup is expected to return None.
count_vect.vocabulary_.get('canada')

In [47]:
# Display the fitted term -> column-index mapping.
# NOTE(review): this prints the whole vocabulary; a slice is easier to read.
count_vect.vocabulary_

{'starting': 7076,
 'desperate': 2139,
 'shows': 6780,
 'winning': 8231,
 'believe': 817,
 'defeat': 2032,
 'donald': 2340,
 'trump': 7724,
 'candidate': 1183,
 'like': 4458,
 'joe': 4092,
 'biden': 866,
 'supported': 7269,
 'iraq': 4001,
 'war': 8085,
 'rights': 6412,
 'climate': 1449,
 'justice': 4144,
 'keepitintheground': 4197,
 'stop': 7132,
 'running': 6497,
 'away': 673,
 'problem': 5863,
 'run': 6495,
 'suck': 7222,
 'really': 6141,
 'pop': 5715,
 'support': 7268,
 'nation': 5025,
 'protests': 5947,
 'happening': 3448,
 'wetsuwenstrong': 8170,
 'om': 5258,
 'friday': 3091,
 'strike': 7166,
 'brussels': 1063,
 'central': 1283,
 'station': 7088,
 '14': 22,
 '00': 0,
 'representatives': 6288,
 'european': 2680,
 'nat': 5019,
 'member': 4742,
 'shirley': 6748,
 'wilson': 8221,
 'supports': 7274,
 'coastalgaslink': 1487,
 'don': 2339,
 'agree': 343,
 'becaus': 794,
 'powerful': 5764,
 'rita': 6428,
 'george': 3203,
 'matriarch': 4676,
 'amp': 435,
 'hereditary': 3554,
 'subchief': 7

In [48]:
# Shorter alias for the fitted vocabulary. This is the same dict object, not a
# copy, so mutating `vocab_dict` would also mutate the vectorizer's vocabulary.
vocab_dict = count_vect.vocabulary_

In [51]:
# Peek at the first five (term, index) pairs in the dict's own iteration order.
list(vocab_dict.items())[:5]

[('starting', 7076),
 ('desperate', 2139),
 ('shows', 6780),
 ('winning', 8231),
 ('believe', 817)]

In [52]:
# Turn the term -> index mapping into (index, term) tuples ordered by index.
# values() and keys() iterate in the same order, so zip pairs them correctly.
vocab_list = sorted(zip(vocab_dict.values(), vocab_dict.keys()))

In [53]:
# First ten (index, term) pairs of the list.
vocab_list[:10]

[(0, '00'),
 (1, '000'),
 (2, '00am'),
 (3, '03'),
 (4, '09'),
 (5, '10'),
 (6, '100'),
 (7, '10000'),
 (8, '101'),
 (9, '1010')]

In [None]:
# Put the highest column index first.
# An explicit descending sort is idempotent: re-running the cell leaves the
# order unchanged, whereas a bare reverse() flips it back on every second run.
vocab_list.sort(reverse=True)

In [None]:
# First ten (index, term) pairs of the list in its current order.
vocab_list[:10]

In [None]:
# Vectorize the vocabulary mapping itself with a DictVectorizer and show the
# result as a dense array.
# NOTE(review): a single mapping is passed here, not a list of per-document
# dicts, and its values are column indices rather than counts. Confirm this is
# the intended input.
dict_vec = DictVectorizer()
dict_vec.fit_transform(count_vect.vocabulary_).toarray()

In [None]:
# Show the feature names learned by the DictVectorizer.
# `get_feature_names()` was removed from recent scikit-learn releases; use
# `get_feature_names_out()` when it exists and fall back otherwise.
(list(dict_vec.get_feature_names_out())
 if hasattr(dict_vec, "get_feature_names_out")
 else dict_vec.get_feature_names())