In [1]:
import pandas as pd
import re
import numpy as np

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, WhitespaceTokenizer, TweetTokenizer

np.random.seed(17)

In [2]:
df = pd.read_excel('source.xlsx')
x_values = df[['house', 'text']]
y_values = df['season']
xtrain, xtest, ytrain, ytest = train_test_split(x_values, y_values)

In [3]:
def clean_article(article):
    art = re.sub("[^A-Za-z0-9' ]", '', article)
    art2 = re.sub("[( ' )( ')(' )]", ' ', art)
    art3 = re.sub('\s[A-Za-z]\s', ' ', art2)
    return art3.lower()

In [4]:
bow = CountVectorizer(stop_words = 'english', preprocessor = clean_article, tokenizer = TweetTokenizer().tokenize,
                      ngram_range=(1,2), max_features = 1000, max_df = 1.0, min_df = 1, binary = False)
training_data = bow.fit_transform(xtrain.text)
test_data = bow.transform(xtest.text)

In [5]:
dftrain = pd.DataFrame(training_data.toarray())
dftrain.columns = bow.get_feature_names()
dftest = pd.DataFrame(test_data.toarray())
dftest.columns = bow.get_feature_names()
dftrain.shape

(33, 1000)

In [6]:
clf = MultinomialNB()
model = clf.fit(dftrain, ytrain)
preds = model.predict(dftest)
accuracy = accuracy_score(ytest, preds)
print('accuracy: ', accuracy)

accuracy:  0.09090909090909091


In [7]:
d = {'targets': ytest.values, 'predictions': preds}
outs = pd.DataFrame(data = d)

In [8]:
l=[]
for i in range(len(outs)):
    if outs.targets.iloc[i][0:4] == outs.predictions.iloc[i][0:4]:
        l.append(1)
    else:
        l.append(0)
outs['right_year'] = l

In [9]:
outs

Unnamed: 0,targets,predictions,right_year
0,2018S,2018F,1
1,2010F,2018F,0
2,2010S,2018F,0
3,2010S,2018S,0
4,2010S,2010F,1
5,2010S,2010F,1
6,2018F,2018S,1
7,2010S,2010F,1
8,2010S,2010F,1
9,2018S,2018S,1


In [10]:
acc = sum(outs.right_year)/len(outs.right_year)
print('right-year accuracy: ', acc)

right-year accuracy:  0.7272727272727273


In [11]:
df = pd.read_excel('source.xlsx')
x_values = df[['house', 'text', 'season']]
y_values = df['this_year']
xtrain, xtest, ytrain, ytest = train_test_split(x_values, y_values)

In [12]:
bow = CountVectorizer(stop_words = 'english', preprocessor = clean_article, tokenizer = TweetTokenizer().tokenize,
                      ngram_range=(1,2), max_features = 1000, max_df = 1.0, min_df = 1, binary = False)
training_data = bow.fit_transform(xtrain.text)
test_data = bow.transform(xtest.text)

In [13]:
dftrain = pd.DataFrame(training_data.toarray())
dftrain.columns = bow.get_feature_names()
dftest = pd.DataFrame(test_data.toarray())
dftest.columns = bow.get_feature_names()
dftrain.shape

(33, 1000)

In [14]:
clf = MultinomialNB()
model = clf.fit(dftrain, ytrain)
preds = model.predict(dftest)
accuracy = accuracy_score(ytest, preds)
print('accuracy: ', accuracy)

accuracy:  0.8181818181818182


In [15]:
feature_names = np.array(list(dftrain.columns.values))
n=20
coefs_with_fns = sorted(zip(model.coef_[0], feature_names))
top = list(zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]))

In [17]:
table1 = pd.DataFrame(columns=['informative for 2010', 'coefficient for 2010', 
                               'informative for 2018', 'coefficient for 2018'])
for i in top:
    table1 = table1.append({'informative for 2010': i[0][1], 'coefficient for 2010': i[0][0],
                           'informative for 2018': i[1][1], 'coefficient for 2018': i[1][0]}, ignore_index=True)


%store table1
table1

Stored 'table1' (DataFrame)


Unnamed: 0,informative for 2010,coefficient for 2010,informative for 2018,coefficient for 2018
0,applied,-8.094073,fashion,-4.628337
1,army,-8.094073,like,-5.049551
2,balanced,-8.094073,collection,-5.098341
3,barely,-8.094073,said,-5.149634
4,basket,-8.094073,clothes,-5.149634
5,beach,-8.094073,time,-5.386023
6,belongs,-8.094073,way,-5.455016
7,brain,-8.094073,people,-5.455016
8,brown,-8.094073,going,-5.455016
9,business,-8.094073,theres,-5.609166
