In [10]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word

In [11]:
# Read in the data
test = pd.read_csv("../maindatasets/cleaned_train.csv")
train = pd.read_csv("../otherdset/cleaned_testlong.csv", dtype={'label': np.int64, 'Tweet': np.string_})
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column: ', sum(pd.isnull(train['Tweet']) == True))
print('Number of NaNs in Tweet column: ', sum(pd.isnull(test['Tweet']) == True))

# Remove NaNs
test.dropna(subset=['Tweet'], inplace = True)
train.dropna(subset=['Tweet'], inplace = True)
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column:', sum(pd.isnull(train['Tweet']) == True))
print('Number of NaNs in Tweet column:', sum(pd.isnull(test['Tweet']) == True))

Number of rows in train: 60815
Number of rows in test: 39780
Number of NaNs in Tweet column:  53
Number of NaNs in Tweet column:  0
Number of rows in train: 60762
Number of rows in test: 39780
Number of NaNs in Tweet column: 0
Number of NaNs in Tweet column: 0


Preprocessing Data: Using tf-idf

In [12]:
# Setting up inputs
train_X = train.Tweet
train_y = train.iloc[:, 1:2]
test_X = test.Tweet
test_y = np.array(test.Sarcastic)

In [13]:
# Since we want a vector representation of all words, we need to take both testing and training and tfidf them
combined_text = train_X.append(test_X, ignore_index=True)

In [14]:
# Initialize the tf-idf matrix from sklearn
vectorizer = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             lowercase=True, # Convert all uppercase to lowercase
                             max_df = 0.9) # Only consider words that appear in fewer than max_df percent of all documents          
tfidf_matrix = vectorizer.fit(combined_text) # fit tfidf to comments

In [15]:
# Transform test and train into a numerical representation of comments
train_features_X = tfidf_matrix.transform(train_X)
test_features_X = tfidf_matrix.transform(test_X)

Fitting RandomForestClassifer

In [16]:
# Initialize the model
clf = RandomForestClassifier()

In [17]:
# Fit the net to the training data
clf.fit(train_features_X, train_y.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
labels = clf.predict(test_features_X)
print("accuracy:", metrics.accuracy_score(test_y, labels))

accuracy: 0.5219708396178985
