In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [19]:
# Read in the data.
# The Tweet column is parsed as `str` in both splits. The previous `np.string_`
# dtype no longer exists in NumPy 2.0; with `str`, empty fields are still
# reported as NaN, so the NaN counts below keep their meaning.
train = pd.read_csv("../otherdset/cleaned_trainlong.csv", dtype={'Tweet': str})
test = pd.read_csv("../otherdset/cleaned_testlong.csv", dtype={'label': np.int64, 'Tweet': str})
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
# Vectorised NaN count (train first, then test) instead of a Python-level sum over every row
print('Number of NaNs in Tweet column: ', train['Tweet'].isna().sum())
print('Number of NaNs in Tweet column: ', test['Tweet'].isna().sum())

# Remove rows whose Tweet is NaN; reassigning (rather than inplace=True) keeps the step explicit
test = test.dropna(subset=['Tweet'])
train = train.dropna(subset=['Tweet'])
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column:', train['Tweet'].isna().sum())
print('Number of NaNs in Tweet column:', test['Tweet'].isna().sum())

Number of rows in train: 949957
Number of rows in test: 60815
Number of NaNs in Tweet column:  1086
Number of NaNs in Tweet column:  53
Number of rows in train: 948871
Number of rows in test: 60762
Number of NaNs in Tweet column: 0
Number of NaNs in Tweet column: 0


# Preprocessing Data: Using tf-idf

In [24]:
# Separate each frame into its text column (inputs) and its label column (targets)
train_X, train_y = train['Tweet'], train['label']
test_X = test['Tweet']
test_y = np.array(test['label'])  # test labels copied into a plain NumPy array

In [25]:
# Stack the train and test tweets into one Series with a fresh 0..n-1 index,
# so a single tf-idf vocabulary can be built over all of the text.
# pd.concat is used because Series.append was removed in pandas 2.0.
# NOTE(review): including test text here lets the test set influence the
# vocabulary and IDF weights — confirm this is intended.
combined_text = pd.concat([train_X, test_X], ignore_index=True)

In [26]:
# Configure sklearn's tf-idf vectorizer and learn its vocabulary / IDF weights
vectorizer = TfidfVectorizer(
    max_df=0.9,               # ignore terms found in more than 90% of the documents
    lowercase=True,           # fold all text to lowercase before tokenizing
    analyzer='word',          # features are word tokens
    strip_accents='unicode',  # normalise accented characters
)
# NOTE(review): fitting on train+test text exposes test data to the feature
# extractor — confirm this is intended before trusting the reported accuracy.
vectorizer.fit(combined_text)
# fit() returns the vectorizer itself, so despite its name `tfidf_matrix` is the
# fitted vectorizer, not a matrix.
tfidf_matrix = vectorizer

In [27]:
# Turn the raw tweets of each split into tf-idf feature matrices using the fitted vectorizer
train_features_X, test_features_X = (
    tfidf_matrix.transform(tweets) for tweets in (train_X, test_X)
)

# Fitting the Logistic Regression Model

In [28]:
# Create the logistic-regression classifier; the liblinear solver is requested
# explicitly, all other hyper-parameters are left at the library defaults
lr = LogisticRegression(
    solver='liblinear',
)

In [29]:
# Train the classifier on the tf-idf training features; the labels are passed
# as a flat 1-D array
lr.fit(X=train_features_X, y=train_y.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
# Predict a label for every test tweet, then report the fraction that match the true labels
labels = lr.predict(X=test_features_X)
print("accuracy:", metrics.accuracy_score(y_true=test_y, y_pred=labels))

accuracy: 0.7008656726243376
