In [1]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from collections import Counter

# Download files, set up folder, put files into folder

In [2]:
# training data: ./train.tsv
# test data:     ./test.tsv

# Load training and test data

In [3]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')
print(dataframe)

       label                                             review
0          0  Leaks: Liss seems to be totally incompetent: m...
1          1  Replacement Peeler: Loved my old one. Loaned i...
2          0  Not what I was expecting: I chose to rate this...
3          1  Watch face is hard to read: Although I don't o...
4          0  Disappointing: I was eager to read this book s...
...      ...                                                ...
29991      1  Love EW: I must admit that I am a total TV afi...
29992      1  Easy to follow and delicious recipes!: I compl...
29993      1  The Beauty and Mystery of Veronique: Perhaps t...
29994      1  I love it.: Brilliant, hilarious, quick and ea...
29995      0  broken...: bad choice...2d film would not play...

[29996 rows x 2 columns]


In [4]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 23997
validation set size: 5999


In [5]:
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')
print (test_dataframe)

        id                                             review
0        1  Human Hurricane!: Would you like to sleep in t...
1        2  A Mom: I bought this with all kinds of expecta...
2        3  Good Read: I judge all books that I read by a ...
3        4  It's awesome: DVD set is exactly what you'd bu...
4        5  Great Movie!!!: This definatly the best Godzil...
...    ...                                                ...
5995  5996  Beautiful and Spiritual: This is a very beauti...
5996  5997  Another Cash In: This cd is pure dreck and it'...
5997  5998  Concept drawings-very good: The concept drawin...
5998  5999  I hear i all the time is awsome: this is great...
5999  6000  Not so great Performance: This mouse is very s...

[6000 rows x 2 columns]


# Try the trivial baseline: predict the majority label of the training set

In [6]:
Counter(train_dataframe['label'])

Counter({0: 11965, 1: 12032})

In [7]:
# Looks like label 1 has slightly more counts than label 0 in training data
# So the 'majority guess' prediction is an array filled with 1s
majority_guess_pred = [1 for i in range(len(valid_dataframe))]
accuracy = accuracy_score(valid_dataframe['label'], majority_guess_pred)
print ('Majority guess accuracy:', accuracy)

Majority guess accuracy: 0.5099183197199533


In [8]:
# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [9]:
majority_guess_pred_test = [1 for i in range(len(test_dataframe))]
write_test_prediction(test_dataframe, majority_guess_pred_test, './majority_guess.csv')

6000 predictions are written to ./majority_guess.csv


# Build feature extractor

## use all unigrams from training data as features

In [10]:
vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2, binary=True)
vectorizer.fit(train_dataframe['review'])

CountVectorizer(binary=True, max_df=0.95, min_df=2, ngram_range=(1, 2))

# Extract feature vectors for training, validation, and test data 

In [11]:
train_X = vectorizer.transform(train_dataframe['review'])
valid_X = vectorizer.transform(valid_dataframe['review'])
test_X = vectorizer.transform(test_dataframe['review'])
print (train_X.shape)
print (valid_X.shape)
print (test_X.shape)

(23997, 166464)
(5999, 166464)
(6000, 166464)


In [12]:
train_y = train_dataframe['label'].to_numpy()
valid_y = valid_dataframe['label'].to_numpy()
print (train_y.shape)
print (valid_y.shape)

(23997,)
(5999,)


## Use chi-square statistic to select a subset of features

In [13]:
num_features_to_select = 5000
feature_selector = SelectKBest(score_func = chi2, k = num_features_to_select)
feature_selector.fit(train_X, train_y)

# feature names
all_features = [feature for feature, index in sorted(vectorizer.vocabulary_.items(), key = lambda x: x[1])]
selected_features = feature_selector.get_feature_names_out(input_features = all_features)

In [14]:
train_X_selected = feature_selector.transform(train_X)
valid_X_selected = feature_selector.transform(valid_X)
test_X_selected = feature_selector.transform(test_X)
print (train_X_selected.shape)
print (valid_X_selected.shape)
print (test_X_selected.shape)

(23997, 5000)
(5999, 5000)
(6000, 5000)


# Train model on training set

In [15]:
model = LogisticRegression(C = 1, solver='liblinear')
model.fit(train_X_selected, train_y)

LogisticRegression(C=1, solver='liblinear')

# Evaluate model on training set

In [16]:
train_y_hat = model.predict(train_X_selected)
accuracy = accuracy_score(train_y, train_y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)

Logistic regression, accuracy on training set: 0.9598699837479685


# Evaluate model on validation set

In [17]:
valid_y_hat = model.predict(valid_X_selected)
accuracy = accuracy_score(valid_y, valid_y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)

Logistic regression, accuracy on validation set: 0.9034839139856643


# After experimentation on the validation set: retrain the final model on all training data, and predict labels for test data

In [18]:
all_train_X = vectorizer.transform(dataframe['review'])
all_train_X_selected = feature_selector.transform(all_train_X)
all_train_y = dataframe['label'].to_numpy()

model.fit(all_train_X_selected, all_train_y)
test_y_hat = model.predict(test_X_selected)
write_test_prediction(test_dataframe, test_y_hat, './logistic_regression.csv')

6000 predictions are written to ./logistic_regression.csv


# Investigate what the model has learned and where it failed (A.K.A. error analysis)

## Look at learned parameters (for linear model: weight of each dimension)

In [19]:
# construct a mapping: word -> learned weight of this word
feature_weight = {}
for idx, feature in enumerate(selected_features):
    feature_weight[feature] = model.coef_[0][idx]

In [20]:
# words correlated with positive sentiment (top ones)
for k, v in sorted(feature_weight.items(), key = lambda x: x[1], reverse = True)[:20]:
     print ('"{}"'.format(k), v)

"yet but" 2.0904161153475957
"refreshing" 2.0898236877410414
"just what" 2.0805227872007483
"only problem" 2.010323911102065
"hooked" 1.9207650463988215
"awesome" 1.9020337563727694
"own the" 1.8899042575333134
"worried" 1.8751775423358847
"not disappointed" 1.872033346153347
"is must" 1.8162417119208776
"loves this" 1.7965671777130785
"love it" 1.7899655940934682
"pleasantly" 1.695711751891211
"very interesting" 1.682476658140149
"neat" 1.6595996077160493
"love this" 1.6070021740522649
"convenient" 1.5992719620885774
"rocks" 1.5832041915767194
"from getting" 1.578325248557814
"whether you" 1.5613536637847947


In [21]:
# words correlated with negative sentiments (top ones)
for k, v in sorted(feature_weight.items(), key = lambda x: x[1], reverse = False)[:20]:
     print ('"{}"'.format(k), v)

"not good" -2.8426231314306194
"not recommend" -2.5723959827292164
"two stars" -2.517774482607159
"not worth" -2.4606801128022737
"disappointing" -2.4519542898454447
"worthless" -2.264593724686156
"alas" -2.2634422710315194
"overrated" -2.2535685985667175
"terrible" -2.1549726479119236
"poor" -1.987966027010185
"worst" -1.946719402071601
"unsatisfying" -1.9420694488164494
"item because" -1.9223714504479739
"dissapointing" -1.9010157512915464
"trash" -1.8674090261358898
"uninteresting" -1.8670661845335175
"only good" -1.7894340178773542
"pass on" -1.7884616191289573
"awful" -1.7808215225283373
"save your" -1.7758026827127222


## Look at how the model makes predictions on individual examples

In [22]:
# We pick a set of examples from the validation set (we predicted scores for those).
# We usually we don't pick from training data (since the good performance may be unrealistic).
# We cannot do error analysis on test data （because no true target value is provided）.

In [23]:
def explain_linear_prediction(df, model, idx2feature, X, y, y_hat, idx_list):
    print('indices:', idx_list)
    for idx in idx_list:
        print ('==============', idx, '================')
        print ('document:', df.iloc[idx]['review'])
        print ('TRUE label:', df.iloc[idx]['label'])
        print ('PRED label:', y_hat[idx])
        
        print ('\nPRED breakdown:')
        print ('\tINTERCEPT', model.intercept_)
        if X[idx, :].nnz == 0:
            print ('\tFEATURE', '[EMPTY]')
        else:
            sp_row = X[idx, :]
            for i in range(sp_row.getnnz()): # looping over a row in sparse matrix 
                feature_value = sp_row.data[i]
                feature_dim = sp_row.indices[i]
                print ('\tFEATURE', idx2feature[feature_dim], ':', feature_value, '*', model.coef_[0][feature_dim])

In [24]:
# construct a dictionary mapping: feature index -> word
idx2feature = dict([(idx, feature) for idx, feature in enumerate(selected_features)])

# look at data with prediction error
error_indices  = [i for i in range(len(valid_y_hat)) if valid_y_hat[i] != valid_y[i]]
explain_linear_prediction(valid_dataframe, model, idx2feature, valid_X_selected, valid_y, valid_y_hat, np.random.choice(error_indices, size = 1))

indices: [4306]
document: A Far Cry from Better Work: Read "Working" by Studs Terkel.Barbara Ehrenreich's journalistic method is biased and flawed, her intentions are good but this subject is wasted on her talents and approach. Barbara's too-late realization of what it's like to work a minimum wage job is what fostered her spewing of a laundry list of complaints with little or no advice on how to alter the system she abhors. For someone who doesn't know how to do her homework, one would have expected Barbara to have worked more of these jobs before she was 18, and I know plenty of 18-year-olds who know how to yammer better.
TRUE label: 0
PRED label: 1

PRED breakdown:
	INTERCEPT [0.01566016]
	FEATURE and : 1 * 0.3282098284948644
	FEATURE approach : 1 * -0.07577257963029782
	FEATURE before : 1 * 0.1302199030770126
	FEATURE before she : 1 * -0.7404295562120137
	FEATURE better : 1 * -0.2808338279875675
	FEATURE but : 1 * 0.04668107626324463
	FEATURE but this : 1 * -0.4031311161522661
	FEA