## Contribution of Chidananda Pati/012506949
## Political Affiliation Factor
## This notebook is a subset of PoliticalAffilicationDetector_Complete.ipynb that has lot of other steps like word2vec, tf-idf, LDA I tried

In [59]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [60]:
columns=['id','label','statement','subject','speaker','speaker_job','state',
        'party_affiliation','barely_true_count','false_Count',
        'half_true_count','mostly_true_count','pants_on_fire_count','venue_speach'];
df_lair=pd.read_csv('../train.tsv',sep='\t',header=None,names=columns,index_col=False);
df_lair.head()

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party_affiliation,barely_true_count,false_Count,half_true_count,mostly_true_count,pants_on_fire_count,venue_speach
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [61]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chidanandapati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chidanandapati/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chidanandapati/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Text Proprocessing

#### Function for text preprocessing
- lowercase the text
- word tokenization
- remove stop words and non alphanumeric charaters
- stemming

In [62]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
def text_preprocessing(df_base,column):
    df=df_base.copy()
    # lowercase the text
    df[column]=df[column].str.lower()
    # word tokenization
    df[column]=df[column].map(lambda x: nltk.word_tokenize(x))
    # remove stop words and non alphanumeric charaters
    df[column]=df[column].map(lambda x: [w for w in x if (not w in stop_words) and w.isalpha()])
    # lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    df[column]=df[column].map(lambda x: [ wordnet_lemmatizer.lemmatize(w) for w in x])    
    # stemming
    porter = PorterStemmer()
    df[column]=df[column].map(lambda x: [porter.stem(w) for w in x] )
    return df

#### Calling text_preprocessing function

In [63]:
df_train=text_preprocessing(df_lair,'statement')
df_train.head()

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party_affiliation,barely_true_count,false_Count,half_true_count,mostly_true_count,pants_on_fire_count,venue_speach
0,2635.json,false,"[say, anni, list, polit, group, support, abort...",abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,"[declin, coal, start, start, natur, ga, took, ...","energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"[hillari, clinton, agre, john, mccain, vote, g...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,"[health, care, reform, legisl, like, mandat, f...",health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,"[econom, turnaround, start, end, term]","economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


## Doc2Vec Political Affilication

In [64]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec

In [65]:
vocab_political_affiliation=['president','george','bush','administration','republican','democrats','barack','obama','hillary','clinton','donald','trump','senate','house']

In [66]:
tagged_doc_pa = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(vocab_political_affiliation)]

In [67]:
from gensim.models.doc2vec import Doc2Vec
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_doc_pa)

for epoch in range(max_epochs):
    #print('iteration {0}'.format(epoch))
    model.train(tagged_doc_pa,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
model.save("d2v.model.pa")
print("Model Saved")   

Model Saved


In [68]:
df_train_statements_d2v=df_train[['statement','label']]
model= Doc2Vec.load("d2v.model.pa")
texts=[]
for x in df_train_statements_d2v['statement']:
    texts.append(model.infer_vector(x))

## Logistics Regression and Random Forrest for Doc2Vec 

In [69]:
def replace_label(x):
    if (x == 'true' or x=='mostly-true' or x=='half-true'):
        return 1
    else:
        return 0
replace_label('true')    

1

In [70]:
X=pd.DataFrame(texts)
#y=df_train_statements_d2v[['label']]
y=df_train_statements_d2v['label'].map(lambda x:replace_label(x))
X_train,X_test,y_train,y_test=train_test_split(X, y,test_size = .3, random_state = 1)

In [71]:
from sklearn.linear_model import LogisticRegression
logisticRegr_D2V_PA = LogisticRegression(C=100)
logisticRegr_D2V_PA.fit(X_train, y_train)
lr_pred_pa = logisticRegr_D2V_PA.predict(X_test)
import pickle
s = pickle.dumps(logisticRegr_D2V_PA)

In [72]:
from sklearn import metrics
print(metrics.classification_report(y_test,lr_pred_pa))

             precision    recall  f1-score   support

          0       0.49      0.22      0.31      1331
          1       0.58      0.82      0.68      1741

avg / total       0.54      0.56      0.52      3072



In [73]:
from sklearn.ensemble import RandomForestClassifier
rf_D2V_PA = RandomForestClassifier(n_jobs=-1,n_estimators=50,max_depth=90)
rf_D2V_PA.fit(X_train,y_train)
rf_pred_pa = rf_D2V_PA.predict(X_test)

In [74]:
from sklearn import metrics
print(metrics.classification_report(y_test,rf_pred_pa))

             precision    recall  f1-score   support

          0       0.47      0.38      0.42      1331
          1       0.59      0.68      0.63      1741

avg / total       0.54      0.55      0.54      3072



In [75]:
import pickle
def political_affiliation_checker(news):
    data_pred=[]
    data_pred.append(model.infer_vector(news))
    lrg_pa = pickle.loads(s)
    pred_conf=lrg_pa.predict_proba(data_pred)
    #print(pred_conf)
    return pred_conf[0][1]

In [76]:
class PoliticalAffilicationDetector:
    def __init__(self,news):
        self.news=news
    def predict(self):
        return political_affiliation_checker(self.news)

In [77]:
political_affiliation_checker("Says the Annies List political group supports third-trimester abortions on demand.")

0.38778942879552897

In [78]:
logisticRegr_D2V_PA.classes_

array([0, 1])