In [2]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [3]:
# Read in the train/test splits and report their size and missing comments

test = pd.read_csv("../otherdset/data_test_sarcasm.csv")
train = pd.read_csv("../otherdset/data_train_sarcasm.csv", low_memory=False)
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
# Series.isna().sum() counts the missing entries directly
print('Number of NaNs in comments column: ', train['comment'].isna().sum())
print('Number of NaNs in comments column: ', test['comment'].isna().sum())

# Remove rows whose comment is NaN (reassign instead of mutating in place)
test = test.dropna(subset=['comment'])
train = train.dropna(subset=['comment'])
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in comments column:', train['comment'].isna().sum())
print('Number of NaNs in comments column:', test['comment'].isna().sum())

Number of rows in train: 950000
Number of rows in test: 60826
Number of NaNs in comments column:  43
Number of NaNs in comments column:  11
Number of rows in train: 949957
Number of rows in test: 60815
Number of NaNs in comments column: 0
Number of NaNs in comments column: 0


In [19]:
# Preview the first rows whose comment is longer than 15 characters.
# `.str.len()` is vectorized and gives NaN (which fails the comparison) for a
# non-string value instead of raising like `apply(len)` would.
train.loc[train.comment.str.len() > 15].head(10)

Unnamed: 0,label,comment
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.
5,0,"I don't pay attention to her, but as long as s..."
6,0,Trick or treating in general is just weird...
7,0,Blade Mastery+Masamune or GTFO!
8,0,"You don't have to, you have a good build, buy ..."
9,0,I would love to see him at lolla.
10,0,I think a significant amount would be against ...


Preprocessing Data: Using tf-idf

In [6]:
# Setting up inputs: comment text is the feature, the `label` column is the target.
# The target is selected by column name (as test_y already does) rather than by
# position, so an extra leading column such as a saved index can never be
# picked up as the label by mistake.
train_X = train.comment
train_y = train[['label']]  # one-column DataFrame; flattened with .values.ravel() when fitting
test_X = test.comment
test_y = np.array(test.label)

In [7]:
# Build a single corpus from the train and test comments so the tf-idf
# vocabulary covers the words of both splits.
# NOTE(review): fitting on this corpus lets the test comments influence the IDF
# weights; fit on train_X alone if strict train/test separation is required.
# Series.append was removed in pandas 2.0, so pd.concat is used instead.
combined_text = pd.concat([train_X, test_X], ignore_index=True)

In [8]:
# Configure sklearn's tf-idf vectorizer
vectorizer = TfidfVectorizer(
    analyzer='word',
    strip_accents='unicode',
    lowercase=True,        # fold all text to lowercase
    stop_words='english',  # drop common English words ('it', 'a', 'the') that carry little signal
    max_df=0.9,            # ignore terms that appear in more than 90% of the documents
)
# fit() learns the vocabulary and IDF weights from the comments and returns the
# fitted vectorizer itself, so `tfidf_matrix` is a fitted transformer, not a matrix.
tfidf_matrix = vectorizer.fit(combined_text)

In [9]:
# Convert each split's comments to tf-idf feature vectors with the fitted vectorizer
train_features_X, test_features_X = (
    tfidf_matrix.transform(comments) for comments in (train_X, test_X)
)

Fitting RandomForestClassifier

In [10]:
# Initialize the model.
# random_state fixes the forest's randomness so repeated runs give the same
# result; n_jobs=-1 trains the trees in parallel on all available cores.
clf = RandomForestClassifier(random_state=0, n_jobs=-1)

In [None]:
# Train the random forest on the tf-idf features; the one-column label frame
# is flattened to a 1-D array first
clf.fit(train_features_X, np.ravel(train_y.values))

In [None]:
# Predict a label for every test comment, then report the share that match test_y
labels = clf.predict(test_features_X)
test_accuracy = metrics.accuracy_score(test_y, labels)
print("accuracy score:", test_accuracy)