In [13]:
import nltk
import pandas as pd
import random
import numpy as np
import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.classify
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('names')
nltk.download('movie_reviews')

[nltk_data] Downloading package punkt to /Users/bhalla/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bhalla/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/bhalla/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/bhalla/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bhalla/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhalla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /Users/bhalla/nltk_data...
[nltk_data]   Package names is already up-to-

True

In [14]:
df_predict = pd.read_csv('raw_data/computerscience_hot_posts.csv')
df_train = pd.read_csv('raw_data/Reddit_Data.csv')
### There are empty rows in 'raw_data/Reddit_Data.csv'
df_train = df_train.dropna()

## Get Features 

In [15]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])
def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True
positive_words = [word for word, tag in filter(
                  skip_unwanted,
                  nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
                 )]
negative_words = [word for word, tag in filter(
                  skip_unwanted,
                  nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
                  )]

In [16]:
analyzer = SentimentIntensityAnalyzer()
ps = PorterStemmer()

def get_features(dataset, count):
    def intersection(list1,list2):
        x = set(list1)
        y = set(list2)
        z = x.intersection(y)
        return len(z)
    
    def get_simple_features(text):
#         print(text)
        features = {}
        
        # Feature #1 - verbosity
        features['verbosity'] = len(text)
        
#         # Feature #2 - lexical word choice
        scores = analyzer.polarity_scores(text)
        features['vader(pos)'] = scores['pos']
        features['vader(neg)'] = scores['neg']
        features['vader(neu)'] = scores['neu']
        features['vader(compound)'] = scores['compound']
        
#         # Feature #3 - Positive and Negative Words Frequency
        words = word_tokenize(text)
        words = [ps.stem(word) for word in words]
        pos = intersection(words,positive_words)
        neg = intersection(words,negative_words)
        features['num_pos'] = pos
        features['num_neg'] = neg
        try:
            features['tone'] = (pos-neg) / (pos+neg)
        except:
            features['tone'] = 0
        return features 
    
    
    features = [(get_simple_features(row['clean_comment']), row['category']) 
                    for index, row in dataset.iterrows()]
    
    # Feature #4 - TFIDF
    train_data, test_data = dataset[:count], dataset[count:]
    tfidf_vectorizer = TfidfVectorizer(max_features = 10,
                                       stop_words='english',
                                       use_idf=True, 
                                       norm='l2', 
                                       smooth_idf=True) 
    tfidf_train_vectors = tfidf_vectorizer.fit_transform(train_data['clean_comment']).toarray()
    tfidf_test_vectors = tfidf_vectorizer.transform(test_data['clean_comment']).toarray()
    tfidf_vectors = np.vstack([tfidf_train_vectors,tfidf_test_vectors])
    
    for idx in range(len(features)):
        row = features[idx]
        for column in range(tfidf_vectors.shape[1]):
            feature_name = 'tfidf_' + str(column)
            row[0][feature_name] = tfidf_vectors[idx][column]
        features[idx] = row    
    

    return features

## Training

In [17]:
def data_train_test_split(dataset, split_percentage=0.9):
    dataset = dataset.sample(frac = 1)
    count = int(len(dataset) * split_percentage)
    
    feature_set = get_features(dataset, count)
    
    train_set, test_set = feature_set[:count], feature_set[count:]
    return train_set, test_set

In [18]:
def train_model(dataset, model):
    nltk_model = SklearnClassifier(model)
    nltk_model.train(dataset)
    
    return nltk_model

In [19]:
def label_dataset(dataset, model):
    ## 0 Indicating it is a Neutral Tweet/Comment
    ## 1 Indicating a Postive Sentiment
    ## -1 Indicating a Negative Tweet/Comment
    
    res = []
    
    for index, row in dataset.iterrows():
        try:
            res.append(model.classify(get_features(row['text'])))
        except:
            res.append(None)
            
    result_df = dataset
    result_df['sentiment'] = res
    
    return result_df

In [20]:
train_set, test_set = data_train_test_split(df_train)

In [21]:
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier',
         #'Support Vector Classifier'
        ]

random_state = 1234

classifiers = [
    KNeighborsClassifier(weights='distance', n_neighbors=60, p=1),
    DecisionTreeClassifier(min_samples_split=100, min_samples_leaf=35, max_depth=8),
    RandomForestClassifier(min_samples_split=100, min_samples_leaf=50, n_estimators=100, max_depth=20, random_state=random_state),
    LogisticRegression(),
    CalibratedClassifierCV(SGDClassifier(max_iter=100)),
    # SVC(kernel='linear')
]

models = zip(names, classifiers)

df_result = {}

for name, model in models:
    classfier = train_model(train_set, model)
    accuracy = nltk.classify.accuracy(classfier, test_set)
    result = label_dataset(df_predict, classfier)
    df_result[name] = result
    print("{} model Accuracy: {}".format(name, accuracy))

K Nearest Neighbors model Accuracy: 0.6314939434724092
Decision Tree model Accuracy: 0.6742934051144011
Random Forest model Accuracy: 0.6877523553162853


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression model Accuracy: 0.6648721399730821
SGD Classifier model Accuracy: 0.642799461641992


## Ensemble: Voting Classifier 

In [22]:
models = list(zip(names, classifiers))

nltk_ensemble_hard = SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble_hard.train(train_set)
accuracy = nltk.classify.accuracy(nltk_ensemble_hard, test_set)
print("Voting Classifier (hard) model Accuracy: {}".format(accuracy))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting Classifier (hard) model Accuracy: 0.6767160161507403


In [23]:
result = label_dataset(df_predict, nltk_ensemble_hard)
df_result['Voting Hard'] = result

In [24]:
models = list(zip(names, classifiers))

nltk_ensemble_soft = SklearnClassifier(VotingClassifier(estimators=models, voting='soft', n_jobs=-1))
nltk_ensemble_soft.train(train_set)
accuracy = nltk.classify.accuracy(nltk_ensemble_soft, test_set)
print("Voting Classifier (soft) model Accuracy: {}".format(accuracy))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting Classifier (soft) model Accuracy: 0.677792732166891


In [25]:
result = label_dataset(df_predict, nltk_ensemble_soft)
df_result['Voting Soft'] = result