In [1]:
import pandas as pd
import numpy as np
import torch
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import torch.autograd as autograd
import tqdm

from csv_handle import save_csv

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

In [33]:
# Raw posts file and the suffix identifying this dataset variant.
source_file = "data/fitocracy_posts.csv"
version = "_fb"

# NOTE(review): save_csv is a project helper; judging by the names it is
# unpacked into, it presumably writes the three splits and returns their
# filenames -- confirm against csv_handle.
split_paths = save_csv(source_file=source_file, version=version, full=True)
train_filename, valid_filename, test_filename = split_paths

In [34]:
# Load the three pre-split CSV files into DataFrames.
# NOTE(review): these literal paths repeat the "_fb" suffix stored in
# `version` and do not use train_filename / valid_filename / test_filename;
# presumably they name the same files -- confirm, then consider reusing them.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_fb.csv', 'data/valid_fb.csv', 'data/test_fb.csv')
)

In [35]:
# Preview only the first rows instead of rendering the whole training frame.
train.head(10)

Unnamed: 0,comment_text,gender
0,C-can I borrow your maneuver gear?,1
1,Thanks for the follow @SRD - following back :),1
2,thanks for the frequent props. Looks like you...,0
3,So proud of you man. I remember when I first c...,0
4,Thanks for the follow back!,1
5,Hey Man. You haven't posted a workout for qui...,0
6,just joined the Lifehacker group!,1
7,"Thanks for following me, I following back : )",1
8,you're *one* place ahead of me for 30day leade...,0
9,just joined the Fit Angels group!,1


In [36]:
# Pull the comment text (model input) and the gender label (target) out of
# the train and test frames as plain arrays. The validation split is not
# used by this baseline.
text_train, y_train = train['comment_text'].values, train['gender'].values
text_test, y_test = test['comment_text'].values, test['gender'].values

In [37]:
# Vectorize sentences: the TF-IDF vocabulary (capped at 2000 terms, English
# stop words removed) is learned from the training text only and then reused
# unchanged for the test text.
vec = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = vec.fit_transform(text_train)
X_test = vec.transform(text_test)

# Build the classifier.
# The solver and seed are pinned explicitly: scikit-learn's default solver
# changed from 'liblinear' to 'lbfgs' in release 0.22, so an unpinned model
# yields different coefficients depending on the installed version.
# NOTE(review): 'liblinear' is chosen on the assumption that the recorded
# results came from a pre-0.22 release -- confirm.
clf = LogisticRegression(solver='liblinear', random_state=0)
clf.fit(X_train, y_train)

# Predictions on both splits, used by the evaluation cell.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)



In [38]:
# Print a per-class precision/recall/F1 report for each split, train first.
for banner, y_true, y_pred in (
    ("~~TRAIN~~", y_train, y_train_pred),
    ("\n~~TEST~~", y_test, y_test_pred),
):
    print(banner)
    print(classification_report(y_true, y_pred))

~~TRAIN~~
              precision    recall  f1-score   support

           0       0.63      0.70      0.66    100082
           1       0.63      0.55      0.59     91039

   micro avg       0.63      0.63      0.63    191121
   macro avg       0.63      0.63      0.62    191121
weighted avg       0.63      0.63      0.63    191121


~~TEST~~
              precision    recall  f1-score   support

           0       0.62      0.69      0.65     29483
           1       0.61      0.54      0.57     26989

   micro avg       0.62      0.62      0.62     56472
   macro avg       0.62      0.61      0.61     56472
weighted avg       0.62      0.62      0.62     56472



In [39]:
# Inspecting coefficients: pair every vocabulary term with the weight the
# fitted model assigned to its feature column. Only the first coefficient
# row (clf.coef_[0]) is read; it is stored under key 1.
term_weights = clf.coef_[0]
coef_dict = {
    1: {term: term_weights[col] for term, col in vec.vocabulary_.items()}
}

# One row per term, single column named 1.
coef_df = pd.DataFrame(coef_dict)

In [56]:
# Number of terms that received a coefficient (one row per term).
coef_df.shape[0]

2000

In [69]:
# Flatten the coefficient column into (term, weight) pairs ordered from the
# most negative weight to the most positive.
keys = list(coef_df[1].keys())
values = list(coef_df[1].values)

features_coef = sorted(zip(keys, values), key=lambda pair: pair[1])

[('wife', -3.537588582363244),
 ('man', -3.253875539506573),
 ('mate', -3.187442413447355),
 ('bro', -2.756816824997077),
 ('bud', -2.6459608174352622),
 ('brah', -2.43189655777904),
 ('girlfriend', -2.3162790331433802),
 ('protein', -2.1699272216384182),
 ('strength', -2.0739633321855786),
 ('pullups', -2.060375023506042)]

In [70]:
# The 15 terms with the most negative weights (the list is sorted ascending).
features_coef[0:15]

[('wife', -3.537588582363244),
 ('man', -3.253875539506573),
 ('mate', -3.187442413447355),
 ('bro', -2.756816824997077),
 ('bud', -2.6459608174352622),
 ('brah', -2.43189655777904),
 ('girlfriend', -2.3162790331433802),
 ('protein', -2.1699272216384182),
 ('strength', -2.0739633321855786),
 ('pullups', -2.060375023506042),
 ('leangains', -2.023923016108095),
 ('respect', -1.8879821965203907),
 ('dude', -1.86518992338021),
 ('brother', -1.8440284152688524),
 ('robot', -1.8409995642518546)]

In [73]:
# The 15 terms with the most positive weights, largest first.
list(reversed(features_coef[-15:]))

[('barbie', 6.742589896854443),
 ('angels', 5.955550383015419),
 ('girls', 5.083979165241149),
 ('boyfriend', 3.570488896680144),
 ('lifters', 3.41902998798414),
 ('husband', 3.0907429694248085),
 ('hubby', 2.9774812303439737),
 ('frustrated', 2.5551286345332875),
 ('xx', 2.3724947237243468),
 ('excited', 2.3674823895889614),
 ('pilates', 2.3633297798765853),
 ('zumba', 2.349323611294693),
 ('derby', 2.3242079710579597),
 ('gone', 2.256455518902009),
 ('adorable', 2.2402424528806946)]