In [1]:
# Function for reading the Twitter Political Corpus data
# Data from https://www.usna.edu/Users/cs/nchamber/data/twitter/
def get_data_and_labels(filename):
    """Read a tab-separated tweet file and split it into texts and labels.

    The file is read without a header row. Column 0 is returned as the
    labels and column 1 as the tweet texts.

    Args:
        filename: Path to the tab-separated corpus file.

    Returns:
        A tuple ``(x, y)`` of pandas Series, where ``x`` is column 1 (the
        texts) and ``y`` is column 0 (the labels).
    """
    import csv
    import pandas as pd

    # Tweets are free text, so a double quote is ordinary data rather than a
    # CSV quoting character. Under the default quoting rules a tweet that
    # begins with an unmatched '"' is parsed as the start of a quoted field
    # that runs past the end of its line, corrupting or aborting the read.
    df = pd.read_csv(filename, header=None, sep='\t', quoting=csv.QUOTE_NONE)
    x = df.iloc[:, 1]
    y = df.iloc[:, 0]
    return x, y

In [6]:
# Read data; convert to features and labels
import os

# Directory holding the two corpus files. Set the TWITTER_CORPUS_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original location is used unchanged.
DATA_DIR = os.environ.get('TWITTER_CORPUS_DIR', '/Users/dbrizan/Documents/workspace/CS686')

train_x, train_y = get_data_and_labels(os.path.join(DATA_DIR, 'general-tweets.txt'))
test_x, test_y = get_data_and_labels(os.path.join(DATA_DIR, 'keyword-tweets.txt'))

In [7]:
# Function to encode labels
# (This should only be done on the train data.)
def encode_labels(labels):
    """Build a label encoder fitted on the given labels.

    Args:
        labels: The class labels to learn the encoding from.

    Returns:
        The fitted ``sklearn.preprocessing.LabelEncoder``.
    """
    from sklearn.preprocessing import LabelEncoder

    # fit() hands back the encoder itself, so it can be returned directly.
    return LabelEncoder().fit(labels)

In [9]:
# Fit the encoder on the training labels, then apply that same encoder
# to both the training and the test labels.
le = encode_labels(train_y)
train_targets, test_targets = (le.transform(labels) for labels in (train_y, test_y))

In [14]:
# Bag-of-words features: one column per vocabulary term, one row per tweet
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

# The vocabulary is learned from the training tweets only; the test tweets
# are projected onto that same vocabulary.
train_sparse = count_vect.fit_transform(train_x)
test_sparse = count_vect.transform(test_x)

# NOTE(review): the matrices are densified here. The LogisticRegression cells
# that follow should also accept the sparse form, which would use far less
# memory -- confirm nothing else relies on dense arrays before dropping this.
x_train_counts = train_sparse.toarray()
x_test_counts = test_sparse.toarray()

In [16]:
# With features (X) and targets (Y) ready: fit on the train set, predict on the test set

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# fit() returns the estimator, so construction and training are one statement
clf = LogisticRegression().fit(x_train_counts, train_targets)
hyp = clf.predict(x_test_counts)

# Score the test-set predictions against the true test targets
print('Accuracy:', accuracy_score(y_true=test_targets, y_pred=hyp))
print('Confusion Matrix:', confusion_matrix(y_true=test_targets, y_pred=hyp))

('Accuracy:', 0.15618762475049899)
('Confusion Matrix:', array([[ 313,    0],
       [1691,    0]]))


In [17]:
# Performance is mediocre. Perhaps a penalised model would be better? Try "L1 norm" with different values of "C"

C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for c in C:
    # solver='liblinear' is spelled out because the L1 penalty needs a solver
    # that supports it: the default solver in scikit-learn >= 0.22 (lbfgs)
    # raises a ValueError for penalty='l1'. liblinear shuffles the data
    # internally, so random_state is fixed to make repeated fits with the
    # same C report the same accuracy.
    clf = LogisticRegression(penalty='l1', C=c, solver='liblinear', random_state=0)
    clf.fit(x_train_counts, train_targets)
    hyp = clf.predict(x_test_counts)
    accuracy = accuracy_score(test_targets, hyp)
    print('C=', c, '... accuracy:', accuracy)

('C=', 0.0001, '... accuracy:', 0.15618762475049899)
('C=', 0.001, '... accuracy:', 0.15618762475049899)
('C=', 0.01, '... accuracy:', 0.15618762475049899)
('C=', 0.1, '... accuracy:', 0.15618762475049899)
('C=', 1, '... accuracy:', 0.19960079840319361)
('C=', 10, '... accuracy:', 0.26996007984031933)
('C=', 100, '... accuracy:', 0.28942115768463073)
('C=', 1000, '... accuracy:', 0.28642714570858285)
('C=', 10000, '... accuracy:', 0.2345309381237525)


In [19]:
# Confusion matrix for best system of the above
# solver='liblinear' is named explicitly because the default solver in
# scikit-learn >= 0.22 (lbfgs) rejects penalty='l1'; random_state pins
# liblinear's internal shuffling so this refit is repeatable.
clf = LogisticRegression(penalty='l1', C=100, solver='liblinear', random_state=0)
clf.fit(x_train_counts, train_targets)
hyp = clf.predict(x_test_counts)
print('Confusion Matrix:', confusion_matrix(test_targets, hyp))

('Confusion Matrix:', array([[ 276,   37],
       [1389,  302]]))


In [21]:
# And the accuracy of those same predictions?
print('Accuracy:', accuracy_score(y_true=test_targets, y_pred=hyp))
# Choosing C by comparing test-set accuracy is not the best way to determine its value.
# The best way is to use cross validation. We will cover cross validation later.

('Accuracy:', 0.28842315369261479)
