In [1]:
import pandas as pd
import numpy as np
import torch
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import torch.autograd as autograd
import tqdm

from csv_handle import save_csv

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

In [3]:
source_file="data/fitocracy_posts.csv"
version = "_fito"
train_filename, valid_filename, test_filename = save_csv(source_file=source_file, version=version, full=True)

In [4]:
source_file="data/facebook_equal.csv"
version = "_fb"
train_filename_2, valid_filename_2, test_filename_2 = save_csv(source_file=source_file, version=version, full=True)

In [5]:
train = pd.read_csv('data/train_fito.csv')
valid = pd.read_csv('data/valid_fito.csv')
test = pd.read_csv('data/test_fito.csv')

In [6]:
train_2 = pd.read_csv('data/train_fb.csv')
valid_2 = pd.read_csv('data/valid_fb.csv')
test_2 = pd.read_csv('data/test_fb.csv')

In [7]:
# Build the classifier
text_train = train['comment_text'].values
text_test = test['comment_text'].values

y_train = train['gender'].values
y_test = test['gender'].values

In [8]:
# Build the classifier
text_train_2 = train_2['comment_text'].values
text_test_2 = test_2['comment_text'].values

y_train_2 = train_2['gender'].values
y_test_2 = test_2['gender'].values

In [9]:
# Vectorize sentences
vec = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = vec.fit_transform(text_train)
X_test = vec.transform(text_test)

In [10]:
# Vectorize sentences
X_train_2 = vec.fit_transform(text_train_2)
X_test_2 = vec.transform(text_test_2)

In [11]:
clf = LogisticRegression(warm_start=True)
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=True)

In [28]:
saved_right = []

for x,y,text in zip(X_test, y_test, text_test):
    
    r = clf.predict(x)    
    if r == y:
        saved_right.append([x, y, text])

In [29]:
saved_right

[[<1x2000 sparse matrix of type '<class 'numpy.float64'>'
  	with 8 stored elements in Compressed Sparse Row format>,
  0,
  'Thanks for the follow! Your story/journey is awesome and I wish you all the best as you continue.'],
 [<1x2000 sparse matrix of type '<class 'numpy.float64'>'
  	with 13 stored elements in Compressed Sparse Row format>,
  0,
  "Dunno if I should stick to doing a pull-up and a chin-up each time I go through the doorway with the bar or if I should just aim for ten of each a day. I'm worried if I stick to the doorway one, I'll start avoiding that door. Thoughts?"],
 [<1x2000 sparse matrix of type '<class 'numpy.float64'>'
  	with 3 stored elements in Compressed Sparse Row format>,
  0,
  'Thanks for the follow! following back!'],
 [<1x2000 sparse matrix of type '<class 'numpy.float64'>'
  	with 2 stored elements in Compressed Sparse Row format>,
  0,
  'Thanks of the props!'],
 [<1x2000 sparse matrix of type '<class 'numpy.float64'>'
  	with 8 stored elements in Co

In [35]:
clf.fit(X_train_2, y_train_2)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=True)

In [36]:
lost = []

for x,y,text in saved_right:
    r = clf.predict(x)    
    if r != y:
        lost.append([x, y, text])

In [37]:
lost

[]

In [33]:
len(saved_right)

28253

In [34]:
len(y_train_2), len(y_train)

(48000, 191121)

In [39]:
y_test_pred_2 = clf.predict(X_test_2)
y_test_pred = clf.predict(X_test)

In [41]:
print("\n~~TEST~~")
print(classification_report(y_test, y_test_pred))

print("\n~~TEST 2~~")
print(classification_report(y_test_2, y_test_pred_2))


~~TEST~~
              precision    recall  f1-score   support

           0       0.52      0.57      0.55     29483
           1       0.47      0.42      0.45     26989

   micro avg       0.50      0.50      0.50     56472
   macro avg       0.50      0.50      0.50     56472
weighted avg       0.50      0.50      0.50     56472


~~TEST 2~~
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      7962
           1       0.81      0.80      0.80      8038

   micro avg       0.81      0.81      0.81     16000
   macro avg       0.81      0.81      0.81     16000
weighted avg       0.81      0.81      0.81     16000

