In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_pickle('data/label_renamed.pkl')

In [3]:
df['Dominant_Topic'].nunique()

11

In [4]:
df

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,hashtags
0,0,Destruction/Consequences,0.2297,"global, warming, park, stop, joshua, admin, na...","[global, wine, production, reach, new, low, ch...",climatechange wine plastics
1,1,News/Media,0.2824,"threat, news, happening, temperature, day, fac...","[im, dismissing, crazy, conspiracy, im, dismis...",
2,2,Belief/Sentiment,0.4131,"world, trump, head, president, study, way, sup...","[let, turn, thing, around]",ProtectWhatYouLove ActOnClimate Sustainability...
3,3,Arctic/Icecap,0.5074,"it, time, year, arctic, planet, weather, probl...","[new, approach, globalwarming, projection, reg...",
4,4,ClimateChangeIsReal/FightClimateChange,0.7851,"climate, change, zinke, real, human, talk, wor...","[child, estimated, bear, burden, disease, rela...",
...,...,...,...,...,...,...
2999958,2999958,Arctic/Icecap,0.2953,"it, time, year, arctic, planet, weather, probl...","[thank, important, thread, major, danger, planet]",ClimateChange
2999966,2999966,Belief/Sentiment,0.2710,"world, trump, head, president, study, way, sup...","[decision, unilaterally, dismiss, agreement, s...",
2999975,2999975,Action/Efforts/Awareness,0.0667,"amp, action, leader, tweet, thanks, away, summ...","[sarah, sander]",ALSenate RoyMoore FollowTheWhiteRabbit QAnon U...
2999981,2999981,Belief/Sentiment,0.2273,"dont, good, like, think, people, want, know, l...","[let, moment, realize, people, denying, climat...",


In [5]:
# Fix Text column (from list to string)
df['TextString'] = [' '.join(map(str, l)) for l in df['Text']]

In [6]:
df.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,hashtags,TextString
0,0,Destruction/Consequences,0.2297,"global, warming, park, stop, joshua, admin, na...","[global, wine, production, reach, new, low, ch...",climatechange wine plastics,global wine production reach new low changing ...
1,1,News/Media,0.2824,"threat, news, happening, temperature, day, fac...","[im, dismissing, crazy, conspiracy, im, dismis...",,im dismissing crazy conspiracy im dismissing t...
2,2,Belief/Sentiment,0.4131,"world, trump, head, president, study, way, sup...","[let, turn, thing, around]",ProtectWhatYouLove ActOnClimate Sustainability...,let turn thing around
3,3,Arctic/Icecap,0.5074,"it, time, year, arctic, planet, weather, probl...","[new, approach, globalwarming, projection, reg...",,new approach globalwarming projection regional...
4,4,ClimateChangeIsReal/FightClimateChange,0.7851,"climate, change, zinke, real, human, talk, wor...","[child, estimated, bear, burden, disease, rela...",,child estimated bear burden disease related cl...


In [7]:
# from sklearn.feature_extraction.text import TfidfTransformer
X_train, X_test, y_train, y_test = train_test_split(df['TextString'], df['Dominant_Topic'], random_state = 0)

In [8]:
import sklearn.metrics as metrics
lr_f1 = []

# Find f1 score for the c values below
c_value = np.concatenate([np.arange(0.01,0.1,0.01),np.arange(0.1,1,0.1), np.arange(1,11,1)])
for c in c_value:
    vectorizer = CountVectorizer()
    vtrain = vectorizer.fit_transform(X_train)
    vdev = vectorizer.transform(X_test)
    lrmodel = LogisticRegression(C=c, penalty='l2', multi_class='auto', solver='liblinear', max_iter=1000)
    lrmodel.fit(vtrain, y_train)
    dev_pred = lrmodel.predict(vdev)
    f1 = metrics.f1_score(y_test, dev_pred, average='weighted')
    lr_f1.append(f1)
    sum_sq_weight = np.square(lrmodel.coef_).sum(axis=1)
    print("\tFor c-value = %f --> F1 score = %f" % (c, f1))
    for i in range(0,4):
        print("\t\tClass %d: %f" % (i,sum_sq_weight[i]))

# Find highest F1 score
print("\n\tc = %f has the highest f1 score %f" % (c_value[np.argmax(lr_f1)], max(lr_f1)))

	For c-value = 0.010000 --> F1 score = 0.768296
		Class 0: 347.990029
		Class 1: 256.939879
		Class 2: 250.243741
		Class 3: 384.367942
	For c-value = 0.020000 --> F1 score = 0.793576
		Class 0: 583.943256
		Class 1: 438.572572
		Class 2: 417.159202
		Class 3: 669.959608
	For c-value = 0.030000 --> F1 score = 0.804864
		Class 0: 780.048937
		Class 1: 590.825878
		Class 2: 554.362216
		Class 3: 912.629120
	For c-value = 0.040000 --> F1 score = 0.810762
		Class 0: 956.190849
		Class 1: 726.446208
		Class 2: 676.693205
		Class 3: 1130.008753
	For c-value = 0.050000 --> F1 score = 0.814865
		Class 0: 1121.284086
		Class 1: 851.079604
		Class 2: 790.226015
		Class 3: 1331.116987
	For c-value = 0.060000 --> F1 score = 0.817476
		Class 0: 1279.057912
		Class 1: 968.220668
		Class 2: 898.466390
		Class 3: 1520.473839
	For c-value = 0.070000 --> F1 score = 0.819444
		Class 0: 1433.244666
		Class 1: 1080.141437
		Class 2: 1003.632588
		Class 3: 1700.035574
	For c-value = 0.080000 --> F1 score = 

In [10]:
lrmodel = LogisticRegression(C=0.3, penalty='l2', multi_class='auto', solver='liblinear', max_iter=1000)
lrmodel.fit(vtrain, y_train)
dev_pred = lrmodel.predict(vdev)

In [11]:
accu = metrics.accuracy_score(y_test, dev_pred)
print("Accuracy = %f" % accu)

Accuracy = 0.829130
