In [1]:
def get_data_and_labels(filename):
    import pandas as pd
    
    df = pd.read_csv(filename, header=None, sep='\t')
    x = df.iloc[:,1]
    y = df.iloc[:,0]
    return x, y

In [2]:
train_x, train_y = get_data_and_labels('general-tweets.txt')

In [4]:
train_x.head(5)

0    Bumping dj sefs mixtape nowww this is my music...
1    #ieroween THE STORY OF IEROWEEN! THE VIDEO ->>...
2    trick or treating at the mall today; ZOO! last...
3    @Ussk81 PMSL!!! I try not to stare but I can't...
4    @Sc0rpi0n676 btw - is there a remote chance i ...
Name: 1, dtype: object

In [5]:
train_y.head(5)

0    NOT
1    NOT
2    NOT
3    NOT
4    NOT
Name: 0, dtype: object

In [6]:
test_x, test_y = get_data_and_labels('keyword-tweets.txt')
test_x.head(5)

0    Global Voices Online Â» Alex Castro: A liberal...
1    Do the Conservatives Have a Death Wish? http:/...
2    @MMFlint I've seen all of your movies and Capi...
3    RT @AllianceAlert: * House Dems ask for civili...
4    RT @AdamSmithInst Quote of the week: My politi...
Name: 1, dtype: object

In [7]:
test_y.head(5)

0    POLIT
1    POLIT
2      NOT
3    POLIT
4    POLIT
Name: 0, dtype: object

In [8]:
len(train_x)

2000

In [9]:
len(test_x)

2004

In [10]:
def encode_labels(labels):
    from sklearn import preprocessing
    
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    return le

In [None]:
train_y     test_y
0 = NOT    POLIT = 0
    NOT    NOT = 1
1 = POLIT  NOT

In [13]:
# 统一label, 即0 1编码
le = encode_labels(train_y)
train_targets = le.transform(train_y)
test_targets = le.transform(test_y)

In [None]:
以下，弄什么Bag of words
Giving up my favorite vice ...    Republican
1      1  1  1        1           0

In [20]:
# 构建出features set，之前的train_targets就是labels set
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_x).toarray()
x_test_counts = count_vect.transform(test_x).toarray()
x_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
# 从confusion matrix我们可以解读到，nothing is predicted as 1, everything is predicted as 0
# This is called inbalance problem, we have got tons of inpolitical tweets but only little political tweets
# clf means classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

clf = LogisticRegression()
clf.fit(x_train_counts, train_targets)
hyp = clf.predict(x_test_counts)

print ('Accuracy:', accuracy_score(test_targets, hyp))
print ('Confusion Matrix:', confusion_matrix(test_targets, hyp))

Accuracy: 0.15618762475
Confusion Matrix: [[ 313    0]
 [1691    0]]


In [23]:
# 如何解决上述问题，how many features do we have? 因为input是matrix，所以shape后发现我们有9048个features（9048个words），2000是sample data record count
# 所以我们发现for training data, the predictor count (9048) is far more larger than the training data count (2000), so you lose the degree of freedom
x_train_counts.shape

(2000, 9048)

In [None]:
LASSO regression, also called L1 norm regression


the larger the C, the tighter the constraint.


          (optimal performance)
 |x2
/|\
-------->x1
\|/

In [25]:
# accuracy有望从之前15%提升到接近30%
C = [0.0001, 0.0001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for c in C:
    clf = LogisticRegression(penalty='l1', C=c)
    clf.fit(x_train_counts, train_targets)
    hyp = clf.predict(x_test_counts)
    accuracy = accuracy_score(test_targets, hyp)
    print ('C = ', c, '... accuracy:', accuracy)

C =  0.0001 ... accuracy: 0.15618762475
C =  0.0001 ... accuracy: 0.15618762475
C =  0.01 ... accuracy: 0.15618762475
C =  0.1 ... accuracy: 0.15618762475
C =  1 ... accuracy: 0.199600798403
C =  10 ... accuracy: 0.26996007984
C =  100 ... accuracy: 0.287425149701
C =  1000 ... accuracy: 0.289421157685
C =  10000 ... accuracy: 0.214570858283


In [27]:
# Confusion matrix for best system of the above
# better than before but still not good
clf = LogisticRegression(penalty='l1', C=100)
clf.fit(x_train_counts, train_targets)
hyp = clf.predict(x_test_counts)

print ('Confusion Matrix: ', confusion_matrix(test_targets, hyp))

Confusion Matrix:  [[ 276   37]
 [1389  302]]
