In [2]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk

import keras

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from tokenizer import tokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from functions import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Load the positive and negative training tweets.
# The column renaming below assigns exactly two names after 'pred' is added,
# so each file must parse into a single text column for this cell to succeed.
tweet_pos = pd.read_csv('data/train_pos_full.txt', header = None, sep = "\r\n", engine = 'python')
tweet_neg = pd.read_csv('data/train_neg_full.txt', header = None, sep = "\r\n", engine = 'python')

# Label column: 1 for tweets from the positive file, 0 for the negative file.
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0

tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']

# DataFrame.append was removed in pandas 2.0. pd.concat with ignore_index=True
# stacks the negatives first, then the positives, and renumbers the rows
# 0..n-1, which is what append + reset_index + drop('index') produced.
all_tweets = pd.concat([tweet_neg, tweet_pos], ignore_index=True)

all_tweets.head()

Unnamed: 0,tweet,pred
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0


In [6]:
# Twitter-specific stop words, flattened into a plain Python list.
# NOTE(review): read_csv is called without header=None here (unlike the tweet
# files), so the first line of the file is consumed as a header and does not
# end up in the list -- confirm that this matches the file's layout.
stopw_frame = pd.read_csv('data/twitter-stopwords.txt')
stopw = stopw_frame.values.flatten().tolist()

In [7]:
# Load the unlabeled test tweets and pass them through clean_data
# (presumably provided by the star import from the project's functions module).
tweet_test = pd.read_csv('data/test_data.txt', header = None, sep = "\r\n", engine = 'python')
tweet_clean = clean_data(tweet_test.values)

# flatten() yields the 1-D array the vectorizer consumes. The previous bare
# np.reshape(tweet_clean, (10000,)) call returned a new array that was thrown
# away, so its only effect was to raise when the size was not 10000. The same
# sanity check is now explicit and tied to the number of rows actually read.
# NOTE(review): assumes clean_data keeps exactly one entry per input row.
tweet_TE = tweet_clean.flatten()
assert tweet_TE.shape[0] == len(tweet_test), "expected one cleaned tweet per test row"

In [8]:
def _tokenize_tools():
    """Return the shared (tokenizer, lemmatizer, stemmer) triple, built on first use."""
    if not hasattr(_tokenize_tools, "_cached"):
        _tokenize_tools._cached = (TweetTokenizer(), WordNetLemmatizer(), PorterStemmer())
    return _tokenize_tools._cached


def tokenize(t):
    """Split a tweet into tokens, then lemmatize and stem each token.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    list of str
        One lemmatized-then-stemmed string per token, in the order produced
        by TweetTokenizer.
    """
    # The helper objects are reused across calls; previously a new
    # PorterStemmer was constructed for every single token and a new
    # tokenizer/lemmatizer for every tweet.
    tweet_tok, wnl, stemmer = _tokenize_tools()
    return [stemmer.stem(wnl.lemmatize(token)) for token in tweet_tok.tokenize(t)]

In [9]:
# TF-IDF features over word unigrams and bigrams, using the custom tokenize()
# function for splitting and the stopw list for stop-word filtering.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    ngram_range=(1, 2),
)

In [10]:
# Materialise the frame once: column 0 holds the tweet text, column 1 the label.
tweet_matrix = all_tweets.values

# Fit the vocabulary/IDF weights on every training tweet and vectorise them.
X = vectorizer.fit_transform(tweet_matrix[:, 0])
# Labels as an integer array.
Y = tweet_matrix[:, 1].astype(int)

In [11]:
# Hold out 20% of the labelled tweets. The fixed seed makes the split, and
# therefore every score computed from it, repeatable across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

accs = []

# Grid of values for LogisticRegression's C parameter
# (low C = strong regularization, high C = weak regularization).
grid = [0.01, 0.1, 1, 10, 100, 1000, 10000]

for c in grid:

    # Fresh classifier for this regularization strength
    clf = LogisticRegression(solver='lbfgs', C=c, max_iter = 150)

    # Mean accuracy over 10 cross-validation folds of the training split
    scores = cross_val_score(clf, x_train, y_train, cv = 10)
    accs.append(np.mean(scores))

# Points are plotted at positions 0..len(grid)-1 and labelled with the C values.
plt.plot(accs)
plt.xticks(range(len(grid)), grid)
plt.xlabel('Regularization parameter \n (Low - strong regularization, High - weak regularization)')
# The y-axis is left to autoscale: the former fixed window of [0.986, 1] lies
# above the ~0.86 accuracy this notebook reports, so it would hide the curve.
plt.ylabel('Crossvalidation accuracy');

In [12]:
# Train a logistic regression (no C passed, so the library default applies)
# on the training split. fit() returns the estimator itself, so `model` and
# `clf` refer to the same fitted object.
clf = LogisticRegression(solver='lbfgs')
clf.fit(x_train, y_train)
model = clf

# Predicted labels for the held-out 20%.
y_pred = model.predict(x_test)



In [13]:
# Fraction of held-out tweets whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.86199

In [14]:
# Vectorise the cleaned test tweets with the vocabulary and IDF weights
# already fitted on the training tweets (transform only, no re-fitting).
X_TE = vectorizer.transform(raw_documents=tweet_TE)

In [15]:
# Predicted labels for the unlabeled test tweets.
y_pred_TE = model.predict(X=X_TE)

In [16]:
# Hand the test-set predictions to the project's build_submission helper.
# NOTE(review): build_submission is not defined in this notebook (presumably it
# comes from the functions star import); '_full_dataset' looks like a tag for
# the output file name -- confirm against the helper.
build_submission(y_pred_TE, '_full_dataset')