In [135]:
%matplotlib inline

In [156]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import html
import warnings
from collections import OrderedDict
from functools import reduce

In [137]:
#load unclassified data
all_data = list()
with open('unclassified_tweets.txt','r') as f:
    for data in f.readlines():
        all_data.append(data.strip())
        
#conatain in a numpy ndarray for future use 
all_data = np.array(all_data)

In [138]:
#load stop words
stop_words = list()
with open('stop_words.txt','r') as f:
    for stop_word in f.readlines():
        stop_words.append(stop_word.strip())

In [139]:
class DataCleaner(object):
    
    def __init__(self,stopWords,removeProcudreInd:int=False):
        super().__init__()
        self.stopwords=set(stopWords) #hash it for O(1) check
        
        #use reflection to define the cleaning procedures -> form of list of method pointers, sorted by name
        self.procedures = [getattr(self,attr) for attr in \
                           re.findall(r'(run_.*?)(?:\s)',' '.join(sorted(dir(self))))]
        
        #drop the disired procudure if user needs
        if removeProcudreInd is not False:
            self.procedures.pop(removeProcudreInd-1)
            
    #use reduce to sequentially clean the raw data, equivelent to funcN(...func2(func1(data)))
    def clean(self,data):
        return reduce(lambda data,func:func(data),[data]+self.procedures)
    
    def run_01_to_ascii(self,data):
        return html.unescape(data)
    
    def run_02_remove_html_tag(self,data):
        return re.sub(r"\s*<.*?>",'', data)

    def run_03_remove_url(self,data):
        return re.sub(r'\s*(?:https?:\/\/)?[\w.-]+(?:\.[\w.-]+)+[\w\-._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+','',data)

    def run_04_to_lower(self,data):
        return data.lower()
    
    def run_05_remove_all_nonAlphaNumerical_char(self,data):
        return re.sub(r'[^\s\w]+','',data)
    
    def run_06_remove_stop_words(self,data):
        words = data.split()
        for ind,word in enumerate(words):
            if word in self.stopwords:
                words[ind]=''
        return ' '.join(filter(lambda x:x, words)) #clean up empty charactor

In [140]:
thisDataCleaner = DataCleaner(stop_words)

In [141]:
#clean all the data in numpy ndarray with vectorized method
all_data = np.vectorize(thisDataCleaner.clean)(all_data)

In [142]:
all_data_df = pd.DataFrame(all_data.reshape(-1,1),columns=['Twitter'])

#show cleaned data in pandas data frame
all_data_df

Unnamed: 0,Twitter
0,living dream cameraman camera cameraception ca...
1,
2,justin trudeaus reasons thanksgiving todays mo...
3,
4,themadape buttre allergic latex sneeze nbpoli ...
5,
6,2 massive explosions peace march turkey 30 kil...
7,
8,mulcair suggests bad blood trudeau ready4chang...
9,


In [143]:
class PartyLabeller(object):
    
    #default party dictionary -> {party:keyword}
    partyDict ={'Liberal': {'lpc', 'liberal', 'liberals', 'realchange', \
                        'justin', 'trudeau', 'trudeaus', 'justintrudeau'},\
            
            'Conservative':{'cpc', 'conservative', 'conservatives', 'stephenharper', \
                              'stephen' 'harper', 'harpers','tcot'},\
            
            'New Democratic':{'ndp', 'npd', 'tom','thomas','tommulcair', 'tommulcairs', 'thomasmulcair',\
                   'readyforchange', 'ready4change', 'mulcair', 'mulcairs'}}
    
    #A static method to manipulate default party dictionary    e.g. delete or add party key words
    @staticmethod
    def partyDictManipulate(candidates:dict,op:str='-',):
        for party in candidates:
            if op is '-':
                rest = PartyLabeller.partyDict.get(party,set())-candidates[party]
                PartyLabeller.partyDict[party] = rest if rest else PartyLabeller.partyDict.pop(party,None)
            elif op is '+':
                PartyLabeller.partyDict[party] = candidates[party] if not PartyLabeller.partyDict.get(party,None) else \
                PartyLabeller.partyDict[party].update(candidates[party])
            else:
                raise Exception('Invalid operation: {}'.format(op))
        warnings.warn("Party dictionary changed, reinitialize existing instance if needed.")
                
    def __init__(self,partyDict:dict=False):
        super().__init__()
        if partyDict is False:
            partyDict = PartyLabeller.partyDict
        #flip (invert) the map, in use for O(1) check -> target form: {keyword:party}
        self.partyDict = {keyWord: party for party,keyWords in partyDict.items() for keyWord in keyWords}
        
    def label_party(self,data):
        #search for keyword matches, contained in a frozenset form
        return frozenset([self.partyDict.get(word,None) for word in data.split() if self.partyDict.get(word,None)])

In [144]:
thisPartyLabller = PartyLabeller()

In [145]:
party_labels = np.vectorize(thisPartyLabller.label_party)(all_data)

In [157]:
all_data_df['Party Labels'] = party_labels
all_data_df

Unnamed: 0,Twitter,Party Labels,Liberal,Conservative,New Democratic
0,living dream cameraman camera cameraception ca...,(New Democratic),0.0,0.0,0.0
1,,(),0.0,0.0,0.0
2,justin trudeaus reasons thanksgiving todays mo...,(Liberal),0.0,0.0,0.0
3,,(),0.0,0.0,0.0
4,themadape buttre allergic latex sneeze nbpoli ...,(),0.0,0.0,0.0
5,,(),0.0,0.0,0.0
6,2 massive explosions peace march turkey 30 kil...,(),0.0,0.0,0.0
7,,(),0.0,0.0,0.0
8,mulcair suggests bad blood trudeau ready4chang...,"(Liberal, New Democratic)",0.0,0.0,0.0
9,,(),0.0,0.0,0.0


In [179]:
partyOD = OrderedDict(PartyLabeller.partyDict)

def labelCounter(party_labels):
    return [1 if key in party_labels else 0 for key in partyOD]

for key in partyOD:
    all_data_df[key]=iter(zip(*all_data_df['Party Labels'].map(labelCounter)))

In [180]:
all_data_df

Unnamed: 0,Twitter,Party Labels,Liberal,Conservative,New Democratic,a,b,c
0,living dream cameraman camera cameraception ca...,(New Democratic),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,1
1,,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
2,justin trudeaus reasons thanksgiving todays mo...,(Liberal),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,1,0,0
3,,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
4,themadape buttre allergic latex sneeze nbpoli ...,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
5,,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
6,2 massive explosions peace march turkey 30 kil...,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
7,,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
8,mulcair suggests bad blood trudeau ready4chang...,"(Liberal, New Democratic)",<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,1,0,1
9,,(),<zip object at 0x7fbc4d0794c8>,<zip object at 0x7fbc556ee348>,<zip object at 0x7fbc4cf4d648>,0,0,0
