# Sentiment Classification Project

In [1]:
import numpy as np

# Load data

In [2]:
tweets = []
labels = []

def load_tweets(filename, label):
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            tweets.append(line.rstrip())
            labels.append(label)
    
load_tweets('twitter-datasets/train_neg_full.txt', 0)
load_tweets('twitter-datasets/train_pos_full.txt', 1)

# Convert to NumPy array to facilitate indexing
tweets = np.array(tweets)
labels = np.array(labels)

print(f'{len(tweets)} tweets loaded')

2500000 tweets loaded


# Build validation set
We use 90% of tweets for training, and 10% for validation

In [3]:
np.random.seed(1) # Reproducibility!

shuffled_indices = np.random.permutation(len(tweets))
split_idx = int(0.9 * len(tweets))
train_indices = shuffled_indices[:split_idx]
val_indices = shuffled_indices[split_idx:]

len(train_indices), len(val_indices)

(2250000, 250000)

# Bag-of-words baseline

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# We only keep the 5000 most frequent words, both to reduce the computational cost and reduce overfitting
vectorizer = CountVectorizer(max_features=5000)

# Important: we call fit_transform on the training set, and only transform on the validation set
X_train = vectorizer.fit_transform(tweets[train_indices])
X_val = vectorizer.transform(tweets[val_indices])

Y_train = labels[train_indices]
Y_val = labels[val_indices]

Now we train a logistic classifier...

In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1e5, max_iter=100)
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000.0)

In [6]:
Y_train_pred = model.predict(X_train)
Y_val_pred = model.predict(X_val)

train_accuracy = (Y_train_pred == Y_train).mean()
val_accuracy = (Y_val_pred == Y_val).mean()

In [7]:
print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Accuracy (training set): 0.80527
Accuracy (validation set): 0.80324


# Model interpretation

In [8]:
model_features = model.coef_[0]
sorted_features = np.argsort(model_features)
top_neg = sorted_features[:10]
top_pos = sorted_features[-10:]

mapping = vectorizer.get_feature_names()

print('---- Top 10 negative words')
for i in top_neg:
    print(mapping[i], model_features[i])
print()

print('---- Top 10 positive words')
for i in top_pos:
    print(mapping[i], model_features[i])
print()

---- Top 10 negative words
paperback -7.733715089476916
hardcover -6.749721263599857
sadtweet -4.022199848355659
audio -3.8849876465208113
misc -3.7553966613158702
depressing -3.63732789050843
gutted -3.5956754364460863
wahhh -3.521614632401248
apparel -3.217069805985382
fml -3.1400132802859333

---- Top 10 positive words
thx 2.057920021771283
cantsayno 2.059860424345465
blessed 2.1638415390167096
smiling 2.195291992774262
worries 2.3181506563261367
ifindthatattractive 2.4271197000353912
harrypotterchatuplines 2.4633027181285185
smartnokialumia 3.1312606562595673
waystomakemehappy 3.382280627938651
yougetmajorpointsif 4.349066000550539

