In [47]:
%matplotlib inline

In [48]:
import sklearn
import numpy as np
import pandas as pd
import re
import html
from functools import reduce

In [49]:
# Read the raw tweets: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) stripped from each.
with open('unclassified_tweets.txt','r') as tweet_file:
    all_data = [raw_line.strip() for raw_line in tweet_file]

In [50]:
# Read the stop-word list: one word per line, whitespace-stripped.
with open('stop_words.txt','r') as stop_word_file:
    stop_words = [raw_line.strip() for raw_line in stop_word_file]

In [51]:
class DataCleaner(object):
    """Run a string through a fixed sequence of text-cleaning steps.

    The default pipeline, in order, is: ``to_ascii``, ``remove_html_tag``,
    ``remove_url``, ``to_lower``, ``remove_all_nonAlphaNumerical_char`` and
    ``remove_stop_words``. Each step takes a string and returns a string.
    """

    def __init__(self,stopWords,removeProcudreInd:int=False):
        """Build the cleaning pipeline.

        Parameters
        ----------
        stopWords : iterable of str
            Words dropped by ``remove_stop_words``. They are compared against
            the text exactly as given (no case folding is applied to them).
        removeProcudreInd : int, optional
            Index into the default pipeline of a single step to leave out.
            ``False`` (the default) or ``None`` keeps every step. ``0`` is a
            valid index and removes the first step.

        Raises
        ------
        IndexError
            If ``removeProcudreInd`` is an out-of-range index.
        """
        super().__init__()
        self.stopwords=set(stopWords) # a set gives O(1) membership checks
        # The cleaning steps, in the order they are applied.
        self.procedures = [
            self.to_ascii,
            self.remove_html_tag,
            self.remove_url,
            self.to_lower,
            self.remove_all_nonAlphaNumerical_char,
            self.remove_stop_words,
        ]
        # Identity checks, not truthiness: index 0 must still count as "remove".
        if removeProcudreInd is not False and removeProcudreInd is not None:
            self.procedures.pop(removeProcudreInd)

    def clean(self,data):
        """Return ``data`` after applying every step in ``self.procedures`` in order."""
        # Seed the fold with the input instead of building a throwaway list per call.
        return reduce(lambda text,step:step(text),self.procedures,data)

    def remove_html_tag(self,data):
        """Delete ``<...>`` tags together with any whitespace directly before them."""
        return re.sub(r"\s*<.*?>",'', data)

    def remove_url(self,data):
        """Delete URL-like tokens together with any whitespace directly before them.

        The scheme (``http://`` / ``https://``) is optional in the pattern, so
        scheme-less dotted tokens such as ``hello.world`` are removed as well.
        """
        return re.sub(r'\s*(?:https?:\/\/)?[\w.-]+(?:\.[\w.-]+)+[\w\-._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+','',data)

    def to_ascii(self,data):
        """Decode HTML character references (e.g. ``&amp;`` -> ``&``).

        Despite the name, this only calls ``html.unescape``; it does not
        transliterate or strip non-ASCII characters.
        """
        return html.unescape(data)

    def to_lower(self,data):
        """Return ``data`` lower-cased."""
        return data.lower()

    def remove_stop_words(self,data):
        """Drop whitespace-separated words found in ``self.stopwords``.

        The surviving words are re-joined with single spaces, so runs of
        whitespace in the input are collapsed as a side effect.
        """
        return ' '.join(word for word in data.split() if word not in self.stopwords)

    def remove_all_nonAlphaNumerical_char(self,data):
        """Delete every character that is neither whitespace nor a ``\\w`` character."""
        return re.sub(r'[^\s\w]+','',data)

    def remove_hash_tag(self,data):
        """Placeholder step: currently returns ``data`` unchanged.

        The substitution uses an empty pattern and an empty replacement, so it
        has no effect. It is not part of the default pipeline.
        """
        return re.sub(r'','',data)

In [52]:
# Cleaner using the stop words loaded from stop_words.txt and the full default pipeline.
thisDataCleaner = DataCleaner(stopWords=stop_words)

In [53]:
# Replace every raw tweet with its cleaned form, updating the existing list in place.
all_data[:] = [thisDataCleaner.clean(raw_tweet) for raw_tweet in all_data]

In [54]:
# Turn the cleaned tweets into a NumPy array, then present them as a
# one-column table (one tweet per row) under the 'Twitter' header.
all_data = np.array(all_data)
all_data_df = pd.DataFrame(all_data[:, np.newaxis], columns=['Twitter'])
all_data_df

Unnamed: 0,Twitter
0,living dream cameraman camera cameraception ca...
1,
2,justin trudeaus reasons thanksgiving todays mo...
3,
4,themadape buttre allergic latex sneeze nbpoli ...
5,
6,2 massive explosions peace march turkey 30 kil...
7,
8,mulcair suggests bad blood trudeau ready4chang...
9,


In [32]:
class Partylabeller(object):
    """Tag a piece of text with the parties whose keywords appear in it.

    Matching is done on whitespace-separated words and is exact (case- and
    punctuation-sensitive), so the text should be cleaned beforehand.
    """

    # Default keyword sets, keyed by party name. Each keyword needs its own
    # comma: adjacent string literals are silently concatenated by Python.
    partyDict ={'Liberal': {'lpc', 'liberal', 'liberals', 'realchange',
                            'justin', 'trudeau', 'trudeaus', 'justintrudeau'},

                'Conservative': {'cpc', 'conservative', 'conservatives', 'stephenharper',
                                 'stephen', 'harper', 'harpers', 'tcot'},

                'New Democratic': {'ndp', 'npd', 'tom', 'thomas', 'tommulcair', 'tommulcairs',
                                   'thomasmulcair', 'readyforchange', 'ready4change',
                                   'mulcair', 'mulcairs'}}

    def __init__(self,partyDict:dict=False):
        """Build the keyword -> party lookup.

        Parameters
        ----------
        partyDict : dict, optional
            Mapping ``{party: iterable of keywords}``. ``False`` (the default)
            or ``None`` selects the class-level ``Partylabeller.partyDict``.
            If a keyword is listed under several parties, the party that comes
            last in the mapping's iteration order wins.
        """
        if partyDict is False or partyDict is None:
            partyDict = Partylabeller.partyDict
        # Invert the map into {keyword: party}. Note that this instance
        # attribute shadows the class-level ``partyDict`` of the same name.
        self.partyDict = {keyWord: party for party,keyWords in partyDict.items() for keyWord in keyWords}

    def label_party(self,data):
        """Return the set of parties whose keywords occur among the words of ``data``.

        An empty set means no keyword matched.
        """
        # One lookup per word; unmatched words yield None and are filtered out.
        matches = (self.partyDict.get(word) for word in data.split())
        return {party for party in matches if party}

In [33]:
# No keyword map is passed, so the labeller falls back to the class-level
# Partylabeller.partyDict defaults.
thisPartyLabller = Partylabeller()

In [34]:
# One set of matched party names per tweet, in the same order as all_data.
labels = [thisPartyLabller.label_party(tweet) for tweet in all_data]