In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [9]:
# Read in the pre-cleaned train/test tweet datasets
test = pd.read_csv("../otherdset/cleaned_testlong.csv")
train = pd.read_csv("../otherdset/cleaned_trainlong.csv", low_memory=False)
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
# Missing-text counts, train first and then test. `.isna().sum()` is vectorised,
# unlike the builtin sum(), which walks the Series one element at a time in Python.
print('Number of NaNs in comments column: ', train['Tweet'].isna().sum())
print('Number of NaNs in comments column: ', test['Tweet'].isna().sum())

# Remove rows whose tweet text is missing. Rebinding (rather than inplace=True)
# keeps the cell idempotent and avoids mutating a frame behind the reader's back.
test = test.dropna(subset=['Tweet'])
train = train.dropna(subset=['Tweet'])
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
# Sanity check: both counts should now be zero (train first, then test)
print('Number of NaNs in comments column:', train['Tweet'].isna().sum())
print('Number of NaNs in comments column:', test['Tweet'].isna().sum())

Number of rows in train: 949957
Number of rows in test: 60815
Number of NaNs in comments column:  1086
Number of NaNs in comments column:  53
Number of rows in train: 948871
Number of rows in test: 60762
Number of NaNs in comments column: 0
Number of NaNs in comments column: 0


Preprocessing Data: Using tf-idf

In [12]:
# Model inputs: the raw tweet text of each split
train_X = train["Tweet"]
test_X = test["Tweet"]

# Targets. The train target is taken positionally (column index 1) and stays a
# single-column DataFrame; presumably this is the label column - TODO confirm
# against the CSV header. The test target is read by name into a NumPy array.
train_y = train.iloc[:, 1:2]
test_y = np.array(test["label"])

In [13]:
# Since we want a vector representation of all words, stack the training and
# testing tweets into one corpus for the tf-idf fit.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True gives the result a fresh 0..n-1 index.
# NOTE(review): fitting tf-idf on this corpus lets test-set vocabulary and
# document frequencies shape the features - confirm that is intended.
combined_text = pd.concat([train_X, test_X], ignore_index=True)

In [14]:
# Configure sklearn's tf-idf vectorizer
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    # Fold all text to lowercase
    lowercase=True,
    # Drop common English words ('it', 'a', 'the'), which typically carry little signal
    stop_words='english',
    # Ignore terms whose document frequency exceeds 0.9 (a proportion of all documents)
    max_df=0.9,
)
# Learn the vocabulary and idf weights from the combined corpus. fit() returns the
# vectorizer itself, so `tfidf_matrix` is the fitted vectorizer, not a matrix.
vectorizer.fit(combined_text)
tfidf_matrix = vectorizer

In [15]:
# Turn the train and test tweets into tf-idf feature matrices using the fitted vectorizer
train_features_X, test_features_X = (
    tfidf_matrix.transform(tweets) for tweets in (train_X, test_X)
)

Fitting RandomForestClassifier

In [16]:
# Initialize the model.
# random_state fixes the randomness used when building the trees, so a
# Restart & Run All reproduces the same forest and the same accuracy.
# n_jobs=-1 builds the trees in parallel on all available cores; it does not
# change the fitted model, only the wall-clock time.
clf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
# Train the random forest on the tf-idf training features.
# The target frame is flattened to a 1-D array before being passed to fit().
train_labels = train_y.to_numpy().ravel()
clf.fit(train_features_X, train_labels)

In [None]:
# Predict a label for every test tweet, then report accuracy against the true test labels
labels = clf.predict(test_features_X)
accuracy = metrics.accuracy_score(test_y, labels)
print("accuracy score:", accuracy)