In [49]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re
from num2words import num2words
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import matplotlib.pyplot as plt

In [3]:
corpus = ['Apple Orange Orange Apple','Apple Banana Apple Banana','Banana Apple Banana Banana Banana Apple',\
          'Banana Orange Banana Banana Orange Banana','Banana Apple Banana Banana Orange Banana']

In [4]:
''' Create a function to vectorize the counts of a list of sentences. Does not ignore case.'''
def myCountVec(corpus):
    bag = set()  # use a set to figure out how many unique words there are
    word_index = dict() # use a dict to store the column index of the word
    for line in corpus:
        for word in line.split():
            if word in bag:
                continue
            else:
                bag.add(word)
    list_of_words = sorted(list(bag)) # sort the list of words derived from set
    for index,word in enumerate(list_of_words):
        word_index[word] = index  # assign index to word
    vec_of_counts = np.zeros((len(corpus),len(word_index))) # create dummy matrix with all locations initilised to 0
    for row,line in enumerate(corpus):
        for word in line.split():
            col = word_index[word]
            vec_of_counts[row][col] += 1  # populate the matrix, increasing a word count when needed.
    return vec_of_counts

In [5]:
myCountVec(corpus)

array([[2., 0., 2.],
       [2., 2., 0.],
       [2., 4., 0.],
       [0., 4., 2.],
       [1., 4., 1.]])

In [18]:
df = pd.read_csv('MrTrumpSpeeches.csv', sep='~', encoding='latin1')
df['sentiment'] = np.where(df['like_count'] > df['dislike_count'], 1, 0)
df.head(100)

Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles,sentiment
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...,1
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...,1
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...,1
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...,1
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...,1
5,0BxNhsYYDVg,Donald Trump Speeches & Events,20161006,"Full Event: Donald Trump Town Hall in Sandown,...",48636.0,4.538616,882.0,115.0,ladies and gentlemen please welcome radio...,1
6,0RcHzjQX9ug,Donald Trump Speeches & Events,20161106,Full Speech: Donald Trump Rally in Minneapolis...,22240.0,4.402715,376.0,66.0,[Music] [Music] [Music] hey [Music]...,1
7,23Re3yMeIBI,Donald Trump Speeches & Events,20161030,"FULL EVENT: Donald Trump Rally in Las Vegas, N...",44948.0,4.875346,1399.0,45.0,lengthen our military and take care of ou...,1
8,2EfKaciIVPE,Donald Trump Speeches & Events,20160725,Full Speech: Donald Trump Town Hall in Roanoke...,24817.0,4.535088,403.0,53.0,I give you the man that we must ensure is...,1
9,2m5T7Vlo0-A,Donald Trump Speeches & Events,20161005,"Full Speech: Donald Trump Rally in Henderson, ...",26467.0,4.712526,452.0,35.0,[Music] this is something thank you Wow...,1


In [25]:
# lets check some characteristics of the datframe
df.shape

(836, 10)

In [26]:
# ok so 836 rows by 10 columns. Lets check the numeric columns for issues
df.describe()

Unnamed: 0,upload_date,view_count,average_rating,like_count,dislike_count,sentiment
count,836.0,836.0,836.0,836.0,836.0,836.0
mean,20165260.0,24782.65,4.345376,288.478469,56.348086,0.912679
std,4783.116,60209.21,0.702294,498.063763,188.722867,0.282473
min,20160200.0,71.0,1.347826,2.0,0.0,0.0
25%,20160810.0,1788.0,4.263935,29.0,4.0,1.0
50%,20161100.0,8775.5,4.589107,122.0,19.0,1.0
75%,20170310.0,22709.0,4.782065,358.25,47.0,1.0
max,20170800.0,1013054.0,5.0,6143.0,3544.0,1.0


In [28]:
# ok so far so good. Lets check the non numeric columns
for col in ['id','playlist','title','subtitles']:
    print(df[col].describe())

count             836
unique            836
top       j5Zkxo4uvG0
freq                1
Name: id, dtype: object
count                                                   836
unique                                                    5
top       Donald Trump Rally Speech Events Press Confere...
freq                                                    369
Name: playlist, dtype: object
count                                                   836
unique                                                  834
top       FULL SPEECH: President Donald Trump Battle of ...
freq                                                      2
Name: title, dtype: object
count                                                   836
unique                                                  836
top          to you the 45th president of United   State...
freq                                                      1
Name: subtitles, dtype: object


In [38]:
# so looks like we have all the data. Lets check for missing data and nans
df.isnull().any()

id                False
playlist          False
upload_date       False
title             False
view_count        False
average_rating    False
like_count        False
dislike_count     False
subtitles         False
sentiment         False
dtype: bool

In [8]:
# so there are some genral punctuation marks and brackets that should be removed since they dont add anything to the context.
# there also are quite a few words that are numbers. We will try to change these to text using num2words
def cleaningFunc(line):
    line = re.sub('[\/\+\-,:.\'\$*%\[\]\&]','',line) # remove puctuation marks and brackets
    line = line.lower() # convert everthing to lowercase
    words = line.split() # split into words
    newwords = []
    for word in words:
        word = word.strip(' ') # remove any trailing or leading spaces
        newwords.append(word)
    return " ".join(newwords) # return sentence with word seperated by a single space

In [57]:
df['subtitle_clean'] = df['subtitles'].apply(lambda x : cleaningFunc(x))
cvecs = CountVectorizer(stop_words='english')
Xcv = cvecs.fit_transform(df['subtitle_clean'].values).toarray()
tfvecs = TfidfVectorizer(min_df=1,max_df=0.8,stop_words='english',ngram_range=(1,2))
Xtf = tfvecs.fit_transform(df['subtitle_clean'].values).toarray()
tfvecs.vocabulary_

{'presidents': 271169,
 'united': 374415,
 'states': 330933,
 'mr': 231560,
 'trapp': 365939,
 'family': 119580,
 'thank': 349352,
 'applause': 20231,
 'music': 232164,
 'everybody': 113207,
 'want': 383099,
 'begin': 30608,
 'thanking': 350312,
 'south': 325006,
 'carolina': 49581,
 'special': 325722,
 'state': 330131,
 'know': 191440,
 'little': 206820,
 'boost': 38325,
 'week': 389418,
 'place': 260866,
 'remember': 290219,
 'new': 236674,
 'hampshire': 151219,
 'forget': 128834,
 'love': 212745,
 'sent': 313774,
 'good': 141443,
 'feeling': 122207,
 'right': 296632,
 'really': 284570,
 'volunteers': 380515,
 'travel': 365980,
 'say': 306568,
 'doing': 97889,
 'theyre': 352898,
 'going': 138296,
 'texas': 349171,
 'nevada': 236620,
 'ill': 168816,
 'making': 216271,
 'big': 34298,
 'speech': 326158,
 'tomorrow': 362247,
 'atlanta': 24638,
 'think': 355466,
 'terrific': 348325,
 'sec': 310289,
 'exciting': 114704,
 'expect': 115362,
 'easy': 104127,
 'dont': 100007,
 'anymore': 19515