In [2]:
import re
import collections
from operator import itemgetter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split

In [3]:
############### Reading & Removing Noise from comments #################
def read_training_data(training_file):
    f = open(training_file)
    f.readline()

    data = []
    labels = []
    for row in f:
        row = row.strip().split("\"\"\"")
        label = row[0].split(",")[0]
        text = row[1]       
        data.append(re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text)))
        labels.append(float(label))
    return {"data":data,"labels":labels}

def read_test_data1(test_file):
    f = open(test_file)
    f.readline()

    data = []
    labels = []
    for row in f:
        
        row = row.strip().split("\"\"\"")
        label = row[0].split(",")[0]
        text = row[1]       
        data.append(re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text)))
        labels.append(float(label))
    return {"data":data,"labels":labels}

def read_test_data2(test_file):
    f = open(test_file)
    f.readline()

    data = []
    labels = []
    for row in f:
        
        row = row.strip().split("\"\"\"")
        label = row[0].split(",")[1]
        text = row[1]       
        data.append(re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text)))
        labels.append(float(label))
    return {"data":data,"labels":labels}

############### Feature Engineering ######################################
def insultWords(text):
    f = open('Bad words.txt')
    f.readline()
    count=0
    data=""
    for row in f:
        text = row       
        if count==0:
            data=re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text))
            count+=1
        else:
            data="|" + re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text))
    return len(re.findall(data,text.lower() ))
    
def negWords(text):
    f = open('neg words.txt')
    f.readline()
    count=0
    data=""
    for row in f:
        text = row       
        if count==0:
            data=re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text))
            count+=1
        else:
            data="|" + re.sub('_|\.',' ',re.sub("\n|,|\'|\"","",text))
    return len(re.findall(data,text.lower() ))

def exaggeration(text):
    return (len(re.findall("\?|!",text )))

def extract_features(texts, feature_functions):
    return [[f(es) for f in feature_functions] for es in texts]   


In [4]:
#################### Training ###################################
print("Reading Training Data")
training = read_training_data("data/train.csv")

#################### Extract Features ###########################
feature_functions = [insultWords,negWords,exaggeration]
features = extract_features(training["data"],feature_functions)

#################### Calculating TFIDF Vector ###################   
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),token_pattern=ur'\b\w+\b',stop_words=None, min_df=3)
tfidf_features = tfidf_vectorizer.fit_transform(training["data"])
        
#################### LogisticRegression #########################    
lr = LogisticRegression(C=.1, class_weight=None, dual=False,fit_intercept=True, intercept_scaling=1, penalty='l2',tol=0.0001)
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=ur'\b\w+\b',stop_words=None, min_df=3,binary=True)
text_features = bigram_vectorizer.fit_transform(training["data"]) 
lr.fit(text_features,training["labels"])
lr_preds = lr.predict_proba(text_features)

#################### Multinomial Naive Bayes ####################
nb = MultinomialNB()
nb.fit(tfidf_features, training["labels"])
nb_preds = nb.predict_proba(tfidf_features)  

#################### RandomForest Classifiers ###################
rf = RandomForestClassifier(n_estimators=500)
rf.fit(features,training["labels"]) 
rf_preds = rf.predict_proba(features)

#################### GradientBoosting Classifier ################
gb_features = np.empty((len(lr_preds),2))
for i in range(len(lr_preds)):
    gb_features[i][0] = (lr_preds[i][1])
    #gb_features[i][1] = (rf_preds[i][1])
    gb_features[i][1] = (nb_preds[i][1])
    
gb = GradientBoostingClassifier(n_estimators=200)
gb.fit(gb_features,training["labels"])

print "Training Completed"

Reading Training Data
Training Completed


In [5]:
#################### Testing DataSet1 ####################################
print("Reading Test Data")
test = read_test_data1("data/test_with_solutions.csv")

text_features_test = bigram_vectorizer.transform(test["data"])    
tfidf_features_test = tfidf_vectorizer.transform(test["data"])
features = extract_features(test["data"],feature_functions)

lr_preds = lr.predict_proba(text_features_test)
rf_preds = rf.predict_proba(features) 
nb_preds = nb.predict_proba(tfidf_features_test)

gb_features = np.empty((len(lr_preds),2))
   
for i in range(len(lr_preds)):
    gb_features[i][0] = (lr_preds[i][1])
    #gb_features[i][1] = (rf_preds[i][1])
    gb_features[i][1] = (nb_preds[i][1])
    
predictions = gb.predict_proba(gb_features)
predict=np.zeros(len(predictions))
for i in range(len(predictions)):
    if (predictions[i][1] >= predictions[i][0]):
        predict[i]=1
    else:
        predict[i]=0

print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Test Set-1 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Reading Test Data
Matched: 2233  Outof: 2647
Test Set-1 Accuracy =  0.843596524367


In [6]:
predict=np.zeros(len(lr_preds))
for i in range(len(lr_preds)):
    if (lr_preds[i][1] >= lr_preds[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Logistic Regression Test Set-1 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Matched: 2219  Outof: 2647
Logistic Regression Test Set-1 Accuracy =  0.838307517945


In [7]:
predict=np.zeros(len(nb_preds))
for i in range(len(nb_preds)):
    if (nb_preds[i][1] >= nb_preds[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Naive Bayes Test Set-1 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Matched: 2089  Outof: 2647
Naive Bayes Test Set-1 Accuracy =  0.789195315451


In [8]:
predict=np.zeros(len(rf_preds))
for i in range(len(rf_preds)):
    if (rf_preds[i][1] >= rf_preds[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Random Forest Test Set-1 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Matched: 1952  Outof: 2647
Random Forest Test Set-1 Accuracy =  0.737438609747


In [9]:
#################### Testing DataSet2 ####################################
print("Reading Test Data")
test = read_test_data2("data/impermium_verification_labels.csv")

text_features_test = bigram_vectorizer.transform(test["data"])    
tfidf_features_test = tfidf_vectorizer.transform(test["data"])
features = extract_features(test["data"],feature_functions)

lr_preds = lr.predict_proba(text_features_test)
rf_preds = rf.predict_proba(features) 
nb_preds = nb.predict_proba(tfidf_features_test)

gb_features = np.empty((len(lr_preds),2))
      
for i in range(len(lr_preds)):
    gb_features[i][0] = (lr_preds[i][1])
    #gb_features[i][1] = (rf_preds[i][1])
    gb_features[i][1] = (nb_preds[i][1]) 
       
predictions = gb.predict_proba(gb_features)
predict=np.zeros(len(predictions))
for i in range(len(predictions)):
    if (predictions[i][1] >= predictions[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Test Set-2 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Reading Test Data
Matched: 1584  Outof: 2235
Test Set-2 Accuracy =  0.708724832215


In [10]:
predict=np.zeros(len(lr_preds))
for i in range(len(lr_preds)):
    if (lr_preds[i][1] >= lr_preds[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Logistic Regression Test Set-2 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Matched: 1515  Outof: 2235
Logistic Regression Test Set-2 Accuracy =  0.677852348993


In [11]:
predict=np.zeros(len(rf_preds))
for i in range(len(rf_preds)):
    if (rf_preds[i][1] >= rf_preds[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Random Forest Test Set-2 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Matched: 1156  Outof: 2235
Random Forest Test Set-2 Accuracy =  0.517225950783


In [12]:
predict=np.zeros(len(nb_preds))
for i in range(len(nb_preds)):
    if (nb_preds[i][1] >= nb_preds[i][0]):
        predict[i]=1
    else:
        predict[i]=0
print "Matched:",np.sum(predict==test["labels"])," Outof:",len(test["labels"])
print "Naive Bayes Test Set-2 Accuracy = ", np.sum(predict==test["labels"])/float(len(test["labels"]))

Matched: 1351  Outof: 2235
Naive Bayes Test Set-2 Accuracy =  0.604474272931
