# Party ID classification 

Vivek Datta initially wrote this notebook. Jae Yeon Kim reviewed the notebook, edited the markdown, reproduced and commented on the code, and made substantial changes to it.

## Import libraries 

In [2]:
# Install uninstalled libs 
import sys
#!conda install --yes --prefix {sys.prefix} textblob

import pandas as pd
import pickle 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from collections import Counter
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)

# NLTK
import re
import urllib
from textblob import TextBlob
from gensim.models import Word2Vec
from wordcloud import WordCloud
from wordcloud import STOPWORDS

import nltk as nlp
# nlp.download('punkt'); nlp.download('wordnet')  # You may need to download these NLTK datasets first (nltk is imported as nlp)
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.text import Text  
from nltk.stem.lancaster import LancasterStemmer
from spacy.lang.en.stop_words import STOP_WORDS

# ML

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB # Naive-Bayes
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression # Linear models
from xgboost import XGBClassifier # Xgboost
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

################### Validation ######################
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedKFold

################### Vectorizer ######################
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA

################### Model evals #####################
from sklearn.metrics import accuracy_score, balanced_accuracy_score 

################### Imbalanced data #####################
from sklearn.utils import resample # for resampling

# Custom functions
from clean_text import clean_tweet

## Load data 

In [3]:

# Read the tweets to be labelled. The file is parsed as tab-separated with no header row,
# so pandas assigns integer column names that are replaced below.
data = pd.read_csv('/home/jae/intersectional-bias-in-ml/raw_data/hatespeech_text_label_vote_RESTRICTED_100K.csv', sep='\t', header=None)

# Name columns (this assignment requires the file to have exactly three columns)
data.columns = ['Tweet', 'label', 'votes']

data.head()

Unnamed: 0,Tweet,label,votes
0,Beats by Dr. Dre urBeats Wired In-Ear Headphon...,spam,4
1,RT @Papapishu: Man it would fucking rule if we...,abusive,4
2,It is time to draw close to Him &#128591;&#127...,normal,4
3,if you notice me start to act different or dis...,normal,5
4,"Forget unfollowers, I believe in growing. 7 ne...",normal,3


## Clean text 

In [4]:
# Replace the raw text with its cleaned form; clean_tweet is imported from the local clean_text module.
# NOTE: the column is overwritten in place, so re-running this cell feeds already-cleaned text
# back into clean_tweet.
data['Tweet'] = clean_tweet(data['Tweet'])

data.head()

Unnamed: 0,Tweet,label,votes
0,beats by dr dre urbeats wired inear headphones...,spam,4
1,man it would fucking rule if we had a party ...,abusive,4
2,it is time to draw close to him 128591127995 f...,normal,4
3,if you notice me start to act different or dis...,normal,5
4,forget unfollowers i believe in growing 7 new ...,normal,3


## Import and wrangle training data 

Vivek Datta adapted the code from [the following Jupyter notebook](https://github.com/chouhbik/Sentiment-Analysis-of-Tweets/blob/master/Tweets%20Analysis%20DemvsRep.ipynb). The original dataset comes from the [Kaggle website](https://www.kaggle.com/kapastor/democratvsrepublicantweets). Jae Yeon Kim made some changes in Vivek's code (mostly on training and evaluating algorithms). 

In [5]:
# Import the labelled Democrat/Republican tweets used for training

model_data = pd.read_csv("/home/jae/intersectional-bias-in-ml/raw_data/ExtractedTweets.csv")

# Discard every row that has a missing value in any column
model_data.dropna(axis = 0, inplace = True)

# Binary response: 1 when Party is "Democrat", 0 for any other value
model_data["Party_log"] = (model_data["Party"] == "Democrat").astype("int64")

In [6]:
# A custom list of stopwords, added to the STOPWORDS set in one call

STOPWORDS.update(("rt", "s", "u", "amp", "th", "will", "t", "m"))

In [7]:
# Party ID values: one sub-frame of model_data per party

democrat = model_data.loc[model_data["Party"] == "Democrat"]
republican = model_data.loc[model_data["Party"] == "Republican"]

In [8]:
# Clean values

def clean_party_tweets(tweets, stopwords = None):
    """Normalize an iterable of raw tweets into cleaned, space-joined strings.

    Each tweet has its links removed, every non-letter character replaced by a
    space, and is lower-cased, tokenized, stripped of stopwords and lemmatized.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    stopwords : collection of str, optional
        Words to drop after tokenizing. Defaults to the module-level STOPWORDS set.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # Build the lemmatizer once instead of once per tweet.
    lemmatizer = nlp.WordNetLemmatizer()

    cleaned = []
    for tweet in tweets:
        tweet = re.sub(r'http\S+', '', tweet) # remove links
        tweet = re.sub("[^a-zA-Z]", " ", tweet) # remove all characters except letters
        words = nlp.word_tokenize(tweet.lower()) # lowercase, then split into words
        # drop the stopwords, then reduce each remaining word to its dictionary form
        words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
        cleaned.append(" ".join(words))
    return cleaned

# NOTE(review): neither list below is referenced again in the visible part of the notebook;
# the vectorizer is later fit on model_data['Tweet'] directly - confirm whether that is intended.

## Democrats 

democrat_list = clean_party_tweets(democrat.Tweet)

## Republicans 

republican_list = clean_party_tweets(republican.Tweet)

0 = Republicans, 1 = Democrats. Note that the two classes are roughly balanced (44,392 Republican vs. 42,068 Democrat tweets).

In [9]:
# Class counts for the numeric label (1 = Democrat, 0 = any other Party value)
model_data['Party_log'].value_counts()

0    44392
1    42068
Name: Party_log, dtype: int64

In [10]:
# The same counts by party name, as a cross-check of the numeric coding
model_data['Party'].value_counts()

Republican    44392
Democrat      42068
Name: Party, dtype: int64

## Feature extraction (bag-of-words model)

In [11]:
st = LancasterStemmer()

def token(text):
    """Lower-case `text`, tokenize it with NLTK and return the Lancaster stem of every token."""
    return [st.stem(piece) for piece in nlp.word_tokenize(text.lower())]


# Vectorizer: counts of unigrams and bigrams built from the stemmed tokens

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = token,        # custom tokenizer defined above
    ngram_range = (1, 2),     # unigrams and bigrams
    min_df = 1,               # keep terms that occur in at least one document
    max_features = 4000,      # cap on vocabulary size
)

In [12]:
# Turn text into document-term matrix

def dtm_train(data, condition, test_size = 0.2, random_state = 1234, stratify = False):
    """Build a bag-of-words document-term matrix and split it into train and test sets.

    The module-level `vectorizer` is (re)fitted on `data['Tweet']` as a side effect;
    later calls to `vectorizer.transform` rely on the vocabulary learned here.
    Note that the vocabulary is learned from all rows, i.e. before the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Tweet' column and the column named by `condition`.
    condition : str
        Name of the response column.
    test_size : float, default 0.2
        Share of rows held out for testing (training = 1 - test_size).
    random_state : int, default 1234
        Seed for the shuffle performed by train_test_split.
    stratify : bool, default False
        When True, the split preserves the class proportions of the response.
        The default keeps the plain (non-stratified) random split.

    Returns
    -------
    tuple
        (X_train, y_train, X_test, y_test)
    """
    
    ############################### DOCUMENT-TERM MATRIX ################################
    
    # BOW model: toarray() gives a dense numpy.ndarray. todense() would return a
    # numpy.matrix, which recent scikit-learn estimators refuse.
    
    features = vectorizer.fit_transform(data['Tweet']).toarray()

    # Response variable
    
    response = data[condition].values # values 

    ############################### RANDOM TRAIN/TEST SPLIT ################################
    
    # Split into training and testing sets (stratified only when requested)

    X_train, X_test, y_train, y_test = train_test_split(features, response, 
                                                        test_size = test_size,
                                                        random_state = random_state,
                                                        stratify = response if stratify else None) 
    
    return(X_train, y_train, X_test, y_test)

In [13]:
# Build the document-term matrix for the party-labelled tweets and split it.
# model_dtm is the tuple (X_train, y_train, X_test, y_test) returned by dtm_train.

model_dtm = dtm_train(model_data, 'Party_log')

## Train classifiers

###  Functions for various ML models

In [13]:
# Lasso

def fit_logistic_regression(X_train, y_train):
    """Fit and return an L1-penalised (Lasso) logistic regression using the liblinear solver."""
    # The saga solver is faster but does not converge in this case.
    return LogisticRegression(penalty = 'l1', solver = 'liblinear').fit(X_train, y_train)

# Naive-Bayes 

def fit_bayes(X_train, y_train):
    """Fit and return a Gaussian Naive Bayes classifier with default settings."""
    return GaussianNB().fit(X_train, y_train)

# Xgboost

def fit_xgboost(X_train, y_train):
    """Fit and return an XGBoost classifier with column and row subsampling."""
    # NOTE(review): both `random_state` and `seed` are supplied with different values;
    # confirm which one the installed xgboost version honours and keep only that one.
    settings = {
        'random_state': 42,
        'seed': 2,
        'colsample_bytree': 0.6,
        'subsample': 0.7,
    }
    classifier = XGBClassifier(**settings)
    classifier.fit(X_train, y_train)
    return classifier

### Function for evaluating ML models (accuracy and balanced accuracy)

In [17]:
def test_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted model on the test set.

    `X_train` and `y_train` are accepted but not used; only the test data is scored.

    Returns
    -------
    tuple
        (accuracy, balanced accuracy) of the model's predictions for `X_test`.
    """
    predictions = model.predict(X_test)
    scores = (accuracy_score(y_test, predictions),
              balanced_accuracy_score(y_test, predictions))
    return scores

### Model fitting 

In [15]:
def fit_models(data):
    """Fit the three classifiers on the training part of `data`.

    `data[0]` is used as the training features and `data[1]` as the training response.

    Returns
    -------
    tuple
        (lasso, bayes, xgboost) fitted models, in that order.
    """
    X_train, y_train = data[0], data[1]
    # Order matters: callers index the result positionally.
    fitters = (fit_logistic_regression, fit_bayes, fit_xgboost)
    return tuple(fit(X_train, y_train) for fit in fitters)

In [16]:
# Fit all three classifiers on the training split; model_fit is the tuple (lasso, bayes, xgboost).
model_fit = fit_models(model_dtm)

In [17]:

# Save the model object 
# The context manager closes the file even if pickling fails.
# NOTE(review): only the fitted models are saved here, not the fitted `vectorizer`.

with open('/home/jae/intersectional-bias-in-ml/processed_data/model_fit.sav', 'wb') as model_file:
    pickle.dump(model_fit, model_file)

In [14]:
# Load the model object 
# SECURITY: pickle.load can execute arbitrary code; only load files written by this notebook.
# The context manager closes the file once the models are loaded.

with open('/home/jae/intersectional-bias-in-ml/processed_data/model_fit.sav', 'rb') as model_file:
    model_fit = pickle.load(model_file)

## Model evaluations

### Function for testing multiple models

In [15]:
def test_models(models, data):
    """Score the first three entries of `models` on the same data split.

    `data` is indexed as (X_train, y_train, X_test, y_test).

    Returns
    -------
    tuple
        One test_model result per model, in the order (lasso, bayes, xgboost).
    """
    X_train, y_train, X_test, y_test = data[0], data[1], data[2], data[3]
    return tuple(
        test_model(models[position], X_train, y_train, X_test, y_test)
        for position in range(3)
    )

Evaluate each fitted model on the held-out test set.

In [18]:

# Score every fitted model on the test split.
# NOTE: despite its name, `models` holds (accuracy, balanced accuracy) pairs, not estimators.
models = test_models(model_fit, model_dtm)

### Function for putting the model evaluations into a table

In [19]:
def eval_table(data):
    """Arrange per-model scores into a labelled table.

    Parameters
    ----------
    data : iterable of (accuracy, balanced accuracy) pairs
        Exactly three pairs, ordered Lasso, Bayes, XGBoost.

    Returns
    -------
    pandas.DataFrame
        Columns 'Models', 'Accuracy' and 'Balanced Accuracy', one row per model.
    """
    metric_columns = ['Accuracy', 'Balanced Accuracy']
    scores = pd.DataFrame(list(data), columns = metric_columns)
    labelled = scores.assign(Models = ['Lasso', 'Bayes', 'XGBoost'])
    # Put the model names first, followed by the metrics in their original order.
    return labelled[['Models'] + metric_columns]

In [20]:

# Show the accuracy and balanced accuracy of each model as a table
eval_table(models)

Unnamed: 0,Models,Accuracy,Balanced Accuracy
0,Lasso,0.726347,0.725896
1,Bayes,0.707842,0.706522
2,XGBoost,0.70443,0.703564


## Prediction 

### Function for predicting the unlabeled data (tweets)

In [21]:
def predict_text(text, model):   
    """Predict labels for unlabeled documents with an already fitted model.

    The documents are encoded with the vocabulary the module-level `vectorizer`
    has already learned, so the feature columns line up with the ones the model
    was trained on. The vectorizer is not re-fitted here: doing so would build a
    different vocabulary from `text` and make the model's inputs meaningless.

    Parameters
    ----------
    text : iterable of str
        Documents to classify.
    model : fitted estimator
        Any object with a `predict` method trained on features from `vectorizer`.

    Returns
    -------
    The value returned by `model.predict` for the encoded documents.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If `vectorizer` has not been fitted yet.
    """
      
    # BOW model: transform only, reusing the training vocabulary.
    # toarray() returns a dense numpy.ndarray rather than a numpy.matrix.
    
    features = vectorizer.transform(text).toarray()
    
    # Prediction
    
    preds = model.predict(features)
    
    return preds


### Apply the function to the tweets

In [None]:
# Label the cleaned tweets with model_fit[0], the Lasso logistic regression
predicted = predict_text(data['Tweet'], model_fit[0])

### Data quality check

In [None]:
# True if any tweet text is missing. NOTE: in cell order this check runs after the prediction above.
data['Tweet'].isnull().values.any()

In [None]:
# Attach the predicted label to each tweet (coded like Party_log: 1 = Democrat, 0 = otherwise)
data['party_ID'] = predicted

In [None]:
# Distribution of the predicted labels
data['party_ID'].value_counts()

In [None]:
# Preview the tweets together with their predicted party_ID
data.head()

## Export the predicted values 

In [None]:
# Check the column names and order before exporting; the export below supplies four header names
data.columns 

In [None]:
# Write the labelled tweets to CSV. `header` supplies the exported column names, so `data`
# must have exactly four columns in this order; index=True also writes the row index.
data.to_csv("/home/jae/intersectional-bias-in-ml/processed_data/party_ID_predictions.csv", sep=',', encoding='utf-8', 
                    header=["text", "label", "votes", "party_ID"], index=True)