In [None]:
Project Group:
    - Aranza Chaparro
    - Diego Pettorssi
    - Nicholass Anderson

In [1]:
import csv
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
np.random.seed(42)
import random
random.seed(42)

#####################################################
#Prep Work: instantiate Lexicon class and Load Data #
#####################################################

# Load the training datasets into two lists (x_txt; y)
## 1. Create Empty Lists to store the strings from "x_txt" and "y"
## 2. Use a loop to load in training data into "x_txt" and "y" respectively
## 3. use train_test_split on x_txt and y to split the dataset

## Load in Train Data
x_txt = []
y = []

with open('train.tsv', encoding = "utf8") as in_file1: #is a TSV file
    iCSV1 = csv.reader(in_file1, delimiter = '\t', quoting = csv.QUOTE_NONE)
    for row in iCSV1:
        x_txt.append(row[1])
        y.append(row[-1])
        
print(len(x_txt), len(y))

in_file1.close()

## Load in Test Data
x_txt_test2 = []
y_test2 = []

with open('test.tsv', encoding = "utf8") as in_file2: #is a TSV file
    iCSV2 = csv.reader(in_file2, delimiter = '\t', quoting = csv.QUOTE_NONE)
    for row in iCSV2:
        x_txt_test2.append(row[1])
        y_test2.append(row[-1])

#Create Numpy Arrays for x and y data
x_txt = np.array(x_txt)
y = np.array(y)

#Split twitdata_TEST.tsv into Training and Test Set
x_txt_train, x_txt_test, y_train, y_test = train_test_split(x_txt, y, test_size = 0.2) #Split dataset

#Print example to show what Tweets Look Like
x_txt_test2[2]

10592 10592


'@USER @USER @USER @USER @USER @USER @USER Except you kind of are when it comes to gun control'

In [2]:
###########################################
#Running Initial Model: No Features Added #
###########################################

# Summary:
# 1. Convert X_txt_train and X_txt_test to matricies of numbers (i.e., use CountVectorizer)
vec = CountVectorizer(ngram_range = (1,1), min_df = 3)

x_train = vec.fit_transform(x_txt_train) # This should be a matrix
x_test = vec.transform(x_txt_test) # This should be a matrix


print(type(x_train)) #Confirm if Matrix
print(type(x_test)) #Confirm if Matrix
print(x_train.shape, x_test.shape) #Check Dimensions
print(x_train.shape, x_test.shape) #Check Dimensions

# Initialize the classifier LinearSVC 
svc = LinearSVC()

# Create the params with the C values
params = {"C": [0.51]}

# Initialize GridSearchCV
clf = GridSearchCV(svc, params, cv = 5) #use 5-fold CV per instructions

# "fit" the model  on X_train
clf.fit(x_train, y_train)

validation_score = clf.best_score_ # Get the score from the GridSearchCV "best score"

svm_test_predictions = clf.predict(x_test) # "predict" on X_test 

# Use svm_test_predictions and y_test to run precision_score, recall_score, and f1_score
precision = precision_score(svm_test_predictions, y_test, average = 'macro')
recall = recall_score(svm_test_predictions, y_test, average = 'macro')
f1 = f1_score(svm_test_predictions, y_test, average = 'macro')

print("Initial Validation F1: {:.4f}".format(validation_score))
print("Initial Precision: {:.4f}".format(precision))
print("Initial Recall: {:.4f}".format(recall))
print("Initial F1: {:.4f}".format(f1))


<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
(8473, 4584) (2119, 4584)
(8473, 4584) (2119, 4584)




Initial Validation F1: 0.7051
Initial Precision: 0.4687
Initial Recall: 0.5017
Initial F1: 0.4750




In [4]:
# features
class NewClassifier():
    def __init__(self):
        self.profanity_words = set()
        with open('profanity_words.txt') as iFile:
            for row in iFile:
                self.profanity_words.add(row.strip())

    def names(self, sentence):
        pred = 'NOT'
        num_punctuation = 0
        capitalized_words = 0
        names = False
        for i in range(1,len(newlist),1):
            if sentence[i] in [".","?","!"] and sentence[i-1] not in [".","?","!"]:
                num_punctuation += 1
        
        for word in sentence.split():
            if re.search("[A-Z]*", word):
                capitalized_words += 1
        
        if capitalized_words > num_punctuation:
            names = True

        for word in sentence.lower().split():
            if word in self.profanity_words:
                pred = 'UNT'
            break
            
        if pred == "TIN" and names:
            pred = "TIN"
            
        return pred
    
    def find_names(self, sentence):
        num_punctuation = 0
        capitalized_words = 0
        names = False
        newlist= list(sentence)
        for ind in range(1,len(newlist),1):
            if sentence[ind] in [".","?","!"] and sentence[ind-1] not in [".","?","!"]:
                num_punctuation += 1
        
        for word in sentence.split():
            if re.search("[A-Z][a-z]*", word):
                capitalized_words += 1  
        
        if capitalized_words > num_punctuation:
            names = True
            
        return names
    
    def count_exclamation_marks(self, sentence):
        num_excl_marks = 0
        for char in list(sentence):
            if char == "!":
                num_excl_marks += 1
        return num_excl_marks
    
    def count_ellipses(self, sentence):
        num_ellipses = 0
        for word in sentence.lower().split():
            if word in ('...'):
                num_ellipses += 1
        return num_ellipses
    
    def contain_profanity(self, sentence):
        bad_word = False
        for word in sentence.lower().split():
            if word in self.profanity_words:
                bad_word = True
        return bad_word
 

In [5]:
#Creating List of Lists for New Features

new_cls = NewClassifier()

#############################
#Load Feature 1: [Names]   #
############################

x_names_train = []
x_names_test = []
x_names_test2 = []

for tweet in x_txt_train:
    newlist1 = []
    newlist1.append(new_cls.find_names(tweet))
    x_names_train.append(newlist1)
    

for tweet in x_txt_test:
    newlist1 = []
    newlist1.append(new_cls.find_names(tweet))
    x_names_test.append(newlist1)    
    
for tweet in x_txt_test2:
    newlist1 = []
    newlist1.append(new_cls.find_names(tweet))
    x_names_test2.append(newlist1)    
    
########################################
#Load Feature 2: [Exclamation Marks]   #
########################################

x_excl_train = []
x_excl_test = []
x_excl_test2 = []


for tweet in x_txt_train:
    newlist2 = []
    newlist2.append(new_cls.count_exclamation_marks(tweet))
    x_excl_train.append(newlist2)

for tweet in x_txt_test:
    newlist2 = []
    newlist2.append(new_cls.count_exclamation_marks(tweet))
    x_excl_test.append(newlist2)
    
for tweet in x_txt_test2:
    newlist2 = []
    newlist2.append(new_cls.count_exclamation_marks(tweet))
    x_excl_test2.append(newlist2)
    
########################################
#Load Feature 3: [Ellipses]            #
########################################

x_ellipses_train = []
x_ellipses_test = []
x_ellipses_test2 = []


for tweet in x_txt_train:
    newlist3 = []
    newlist3.append(new_cls.count_ellipses(tweet))
    x_ellipses_train.append(newlist3)

for tweet in x_txt_test:
    newlist3 = []
    newlist3.append(new_cls.count_exclamation_marks(tweet))
    x_ellipses_test.append(newlist3)
    
for tweet in x_txt_test2:
    newlist3 = []
    newlist3.append(new_cls.count_exclamation_marks(tweet))
    x_ellipses_test2.append(newlist3)   

In [6]:
import scipy.sparse as sp
from scipy.sparse import hstack

##################################
#Add Features to Dataset         #
##################################   

#FEATURE 1 (Diego - Names): Convert X_train_lexicon_features1 and X_test_lexicon_features1 to numpy arrays
x_names_train = np.array(x_names_train)
x_names_test = np.array(x_names_test)
x_names_test2 = np.array(x_names_test2)

print(type(x_names_train)) #Confirm if numpy array
print(type(x_names_test)) #Confirm if numpy array
print(x_names_train.shape) #Check Dimensions of Feature 1 Train - Names
print(x_names_test.shape) #Check Dimensions of Feature 1 Test - Names


#FEATURE 2 (Diego - Exclaimation Marks): Convert X_train_lexicon_features1 and X_test_lexicon_features1 to numpy arrays
x_excl_train = np.array(x_excl_train)
x_excl_test = np.array(x_excl_test)
x_excl_test2 = np.array(x_excl_test2)

print(type(x_excl_train)) #Confirm if numpy array
print(type(x_excl_test)) #Confirm if numpy array
print(x_excl_train.shape) #Check Dimensions of Feature 2 Train - Exclaimation Points
print(x_excl_test.shape) #Check Dimensions of Feature 2 Test - Exclaimation Points

#FEATURE 3 (Aranza - Ellipses)
x_ellipses_train = np.array(x_ellipses_train)
x_ellipses_test = np.array(x_ellipses_test)
x_ellipses_test2 = np.array(x_ellipses_test2)

print(type(x_ellipses_train)) #Confirm if numpy array
print(type(x_ellipses_test)) #Confirm if numpy array
print(x_ellipses_train.shape) #Check Dimensions of Feature 3 Train - Ellipses
print(x_ellipses_test.shape) #Check Dimensions of Feature 3 Test - Ellipses 

#Prepare for Vectorization
vec = CountVectorizer(ngram_range = (1,1), min_df = 10)

x_train = vec.fit_transform(x_txt_train) 
x_test = vec.transform(x_txt_test)
x_test2 = vec.transform(x_txt_test2)

#FEATURE 1 (Diego - Names) Train: "hstack" x_train with x_names_train
x_train_w_names = hstack([x_train, x_names_train])
#FEATURE 1 (Diego - Names) Test: "hstack" x_test with x_names_test
x_test_w_names = hstack([x_test, x_names_test])
#FEATURE 1 
x_test_w_names2 = hstack([x_test2, x_names_test2])

#FEATURE 2 (Diego - Exclaimation Points) Train:
x_train_w_names_exclaim = hstack([x_train_w_names, x_excl_train])
#FEATURE 2 (Diego - Exclaimation Points) Test:
x_test_w_names_exclaim = hstack([x_test_w_names, x_excl_test])
#FEATURE 2
x_test_w_names_exclaim2 = hstack([x_test_w_names2, x_excl_test2])

#FEATURE 3 (Aranza - Ellipses) Train:
x_train_w_names_exclaim_ell = hstack([x_train_w_names_exclaim, x_ellipses_train])
#FEATURE 3 (Aranza - Ellipses) Test:
x_test_w_names_exclaim_ell = hstack([x_test_w_names_exclaim, x_ellipses_test])
#FEATURE 3
x_test_w_names_exclaim_ell2 = hstack([x_test_w_names_exclaim2, x_ellipses_test2])

# Initialize the classifier LinearSVC 
svc = LinearSVC()

# Create the params with the C values
params = {"C": [0.51]} 

# Initialize GridSearchCV
clf = GridSearchCV(svc, params, cv = 5) #use 5-fold CV per instructions

# "fit" the model x_train_w_names_exclaim_ell (From Validation Split)
clf.fit(x_train_w_names_exclaim_ell, y_train)

##############################################################
#Benchmark improvements from adding features to initial model#
##############################################################

svm_offensive = clf.predict(x_test_w_names_exclaim_ell) # Get predictions on x_train_w_names_exclaim_ell (From train_test_split)
validation_score = clf.best_score_
precision_2 = precision_score(svm_offensive, y_test, average = 'macro') # Get scores using svm_test_predictions and y_test with the precision_score method
recall_2 = recall_score(svm_offensive, y_test, average = 'macro')
f1_Micro2 = f1_score(svm_offensive, y_test, average = 'micro')
f1_Macro2 = f1_score(svm_offensive, y_test, average = 'macro')

print(clf.best_params_)
print("Validation F1: {:.4f}".format(validation_score))
print("Precision: {:.4f}".format(precision_2))
print("Recall: {:.4f}".format(recall_2))
print("F1 Micro: {:.4f}".format(f1_Micro2))
print("F1 Macro: {:.4f}".format(f1_Macro2))
print("")

#################################################
# Compare Initial Model to New Model + Features #
#################################################

assert(f1_Macro2 > f1)
print("The F1 Macro Score has improved in the new model!")
print("F1 Macro score has improved by {:.4f}".format(f1_Macro2 - f1))
print("")

#################################################
#Use Model to Generate Predictions for Test.TSV #
#################################################
svm_offensive2 = clf.predict(x_test_w_names_exclaim_ell2) # Get predictions on x_train_w_names_exclaim_ell2
validation_score2 = clf.best_score_
precision_3 = precision_score(svm_offensive2, y_test2, average = 'macro') # Get scores using svm_offensive2 and y_test2 with the precision_score method
recall_3 = recall_score(svm_offensive2, y_test2, average = 'macro')
f1_Micro3 = f1_score(svm_offensive2, y_test2, average = 'micro')
f1_Macro3 = f1_score(svm_offensive2, y_test2, average = 'macro')

print(clf.best_params_)
print("Validation F1: {:.4f}".format(validation_score2))
print("Precision: {:.4f}".format(precision_3))
print("Recall: {:.4f}".format(recall_3))
print("F1 Micro: {:.4f}".format(f1_Micro3))
print("F1 Macro: {:.4f}".format(f1_Macro3))


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(8473, 1)
(2119, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(8473, 1)
(2119, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(8473, 1)
(2119, 1)




{'C': 0.51}
Validation F1: 0.7081
Precision: 0.4682
Recall: 0.5233
F1 Micro: 0.6951
F1 Macro: 0.4801

The F1 Macro Score has improved in the new model!
F1 Macro score has improved by 0.0051

{'C': 0.51}
Validation F1: 0.7081
Precision: 0.2480
Recall: 0.3333
F1 Micro: 0.7440
F1 Macro: 0.2844


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
######################################
#Add Predictions to TSV using Pandas #
######################################
import pandas as pd

data = pd.read_csv('test.tsv', header = 0, sep = '\t')
data.drop('NOT', inplace = True, axis = 1)
svm_offensive2x = svm_offensive2.tolist()
del svm_offensive2x[0]
data['NOT'] = svm_offensive2x

## File Creation commented out
# data.to_csv("test_preds.tsv", sep="\t")

In [8]:
################################################
#Sample of Predictions for test.tsv: New Model #
################################################

num_tweets = 0
for text, svm_offensive2, y in zip(x_txt_test2, svm_offensive2, y_test2):
    print("Tweet: {}".format(text))
    print("Ground-Truth Class: {}".format(y))
    print("SVM + Features Added: {}".format(svm_offensive2))
    print()
    
    num_tweets += 1
    if num_tweets == 20:
        break

Tweet: @USER Nancy Lee Grahn You Are Awesome! I have been a fan since Santa Barbara!! Alex Davis also Rocks!!!!! Thank you !!!
Ground-Truth Class: NOT
SVM + Features Added: NOT

Tweet: @USER She is a Skrull. Enemy of The Kree. The Kree are who gave Carol her powers and whose uniform she is wearing in the first few moments of the trailer.
Ground-Truth Class: NOT
SVM + Features Added: NOT

Tweet: @USER @USER @USER @USER @USER @USER @USER Except you kind of are when it comes to gun control
Ground-Truth Class: NOT
SVM + Features Added: NOT

Tweet: @USER @USER @USER You are so beautiful♡
Ground-Truth Class: NOT
SVM + Features Added: NOT

Tweet: @USER This is what happens when liberals get in control
Ground-Truth Class: NOT
SVM + Features Added: NOT

Tweet: @USER @USER Daniels said her job does not reflect her character, really. She is a cheap, sleezy porn lap dancer. The is no high road" to take in that type of job"
Ground-Truth Class: NOT
SVM + Features Added: NOT

Tweet: @USER No longer o