In [1]:
import nltk
import json
import pandas as pd
import random
import numpy as np
import joblib
import nltk.classify
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# NLTK resources required by the code below: SentimentIntensityAnalyzer reads
# the 'vader_lexicon' resource and word_tokenize needs the Punkt tokenizer data
# ('punkt' on older NLTK releases, 'punkt_tab' on newer ones). Fetch them here
# so the notebook also runs on a fresh environment (Restart Kernel -> Run All).
for resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)
# Kept last so the cell's displayed value is still this call's result.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/zhaoyiting/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [2]:
class Sentiment_Scorer:
    """Label the rows of a DataFrame with a sentiment predicted by a saved model.

    The scorer loads one persisted classifier (chosen by display name), the
    persisted TF-IDF vectorizer and two word lists (``positive.json`` and
    ``negative.json``), all resolved relative to the current working
    directory. ``label_dataset`` then turns every value of the ``'text'``
    column into a feature dict (see ``get_features``) and passes it to the
    model's ``classify`` method.
    """

    # Display name accepted by ``__init__`` -> joblib file holding that model.
    MODEL_FILES = {
        'K Nearest Neighbors': 'joblib-KNN-Model.pkl',
        'Decision Tree': 'joblib-DT-Model.pkl',
        'Random Forest': 'joblib-RF-Model.pkl',
        'Logistic Regression': 'joblib-LR-Model.pkl',
        'SGD Classifier': 'joblib-SC-Model.pkl',
        'Hard Voting Classifier': 'joblib-vh-Model.pkl',
        'Soft Voting Classifier': 'joblib-vs-Model.pkl',
    }

    def __init__(self, data, model):
        """Load the selected model and the resources needed to build features.

        Args:
            data: DataFrame to score; ``label_dataset`` reads its ``'text'``
                column.
            model: Display name of the classifier to load; must be one of the
                keys of ``MODEL_FILES``.

        Raises:
            ValueError: If ``model`` is ``None`` or not a supported name.
                (``ValueError`` is an ``Exception`` subclass, so existing
                ``except Exception`` handlers keep working.)
        """
        self.data = data

        if model is None:
            raise ValueError("Model has not been selected")
        # The isinstance guard keeps unhashable values (e.g. a list) on the
        # "not supported" path instead of failing inside the dict lookup.
        model_file = self.MODEL_FILES.get(model) if isinstance(model, str) else None
        if model_file is None:
            raise ValueError("The selected model is not currently supported by our API")

        # NOTE: joblib.load is pickle-based and can execute arbitrary code;
        # only load model files that come from a trusted source.
        self.model = joblib.load(model_file)
        self.tfidf_vectorizer = joblib.load('tfidf-vector.pkl')

        # Each lexicon is loaded from the file that matches its name.
        # NOTE(review): this class previously loaded the two files swapped
        # (negative.json into positive_words and vice versa). If the persisted
        # models were trained on features built with that swap, retrain them.
        with open("positive.json", "r", encoding="utf-8") as fp:
            self.positive_words = json.load(fp)
        with open("negative.json", "r", encoding="utf-8") as fp:
            self.negative_words = json.load(fp)

        self.analyzer = SentimentIntensityAnalyzer()
        self.ps = PorterStemmer()

    def get_features(self, text):
        """Build the feature dict for one piece of text.

        Args:
            text: The string to featurize.

        Returns:
            dict with, in insertion order:
              * ``'verbosity'`` - ``len(text)`` (number of characters);
              * ``'vader(pos)'``, ``'vader(neg)'``, ``'vader(neu)'``,
                ``'vader(compound)'`` - the analyzer's polarity scores;
              * ``'num_pos'`` / ``'num_neg'`` - number of *distinct* stemmed
                tokens that occur in the positive / negative word list
                (repeated words are counted once);
              * ``'tone'`` - ``(num_pos - num_neg) / (num_pos + num_neg)``,
                or ``0`` when no lexicon word is present;
              * ``'tfidf_<i>'`` - the i-th component of the TF-IDF vector.
        """
        features = {}

        # Feature #1 - verbosity
        features['verbosity'] = len(text)

        # Feature #2 - lexical word choice
        scores = self.analyzer.polarity_scores(text)
        features['vader(pos)'] = scores['pos']
        features['vader(neg)'] = scores['neg']
        features['vader(neu)'] = scores['neu']
        features['vader(compound)'] = scores['compound']

        # Feature #3 - positive and negative lexicon hits (distinct stems)
        stems = {self.ps.stem(token) for token in word_tokenize(text)}
        pos = len(stems.intersection(self.positive_words))
        neg = len(stems.intersection(self.negative_words))
        features['num_pos'] = pos
        features['num_neg'] = neg
        # Explicit guard instead of a bare except: with no lexicon hit the
        # denominator is zero and the tone is defined as neutral (0).
        hits = pos + neg
        features['tone'] = (pos - neg) / hits if hits else 0

        # Feature #4 - TFIDF
        tfidf_row = self.tfidf_vectorizer.transform([text]).toarray()[0]
        for column, weight in enumerate(tfidf_row):
            features['tfidf_' + str(column)] = weight

        return features

    def label_dataset(self):
        """Classify every row's ``'text'`` and return a labelled copy of the data.

        Label meaning (as documented by the original author):
        ``0`` neutral, ``1`` positive, ``-1`` negative.

        Rows whose text is not a string (for example a missing value) get
        ``None``. Rows for which feature extraction or classification raises
        also get ``None``; those failures are reported once through a
        ``RuntimeWarning`` instead of being swallowed silently.

        Returns:
            A copy of ``self.data`` with an added ``'sentiment'`` column.
            ``self.data`` itself is left unmodified.

        Raises:
            KeyError: If ``self.data`` has no ``'text'`` column.
        """
        import warnings

        labels = []
        failures = []
        for index, text in self.data['text'].items():
            if not isinstance(text, str):
                # Nothing to score (e.g. NaN for a post without a body).
                labels.append(None)
                continue
            try:
                labels.append(self.model.classify(self.get_features(text)))
            except Exception as exc:  # best effort: keep labelling other rows
                failures.append((index, exc))
                labels.append(None)

        if failures:
            first_index, first_error = failures[0]
            warnings.warn(
                "{} row(s) could not be scored and were labelled None; "
                "first failure at index {!r}: {!r}".format(
                    len(failures), first_index, first_error),
                RuntimeWarning,
            )

        # Work on a copy so the caller's DataFrame is not mutated.
        result_df = self.data.copy()
        result_df['sentiment'] = labels

        return result_df

In [3]:
# Load the posts to score; Sentiment_Scorer.label_dataset reads the 'text' column.
df_predict = pd.read_csv('raw_data/computerscience_hot_posts.csv')

In [4]:
# Build a scorer over df_predict using the persisted 'K Nearest Neighbors' model.
Sentiment_Score = Sentiment_Scorer(df_predict,'K Nearest Neighbors')

In [5]:
# Classify every row; the returned frame carries a 'sentiment' column.
result = Sentiment_Score.label_dataset()

In [6]:
# Display the labelled frame (rows without text have no sentiment value).
result

Unnamed: 0.1,Unnamed: 0,created_utc,title,text,author,score,upvote_ratio,num_comments,url,sentiment
0,0,1.673829e+09,"Looking for books, videos, or other resources ...",,mobotsar,93,0.99,113,https://www.reddit.com/r/computerscience/comme...,
1,1,1.686431e+09,/r/ComputerScience will be going dark starting...,"## Update (June 16th, 2023):\n\nThis subreddit...",nuclear_splines,290,0.97,21,https://www.reddit.com/r/computerscience/comme...,1.0
2,2,1.686512e+09,How computers measure time,Can someone explain this to me? I've been told...,RunDiscombobulated67,86,0.98,27,https://www.reddit.com/r/computerscience/comme...,1.0
3,3,1.686514e+09,Question About Registers,Hello everyone. There is a misunderstanding I ...,mellowhorses,62,0.97,24,https://www.reddit.com/r/computerscience/comme...,1.0
4,4,1.686507e+09,Learning a new skill,"Hey guys,\n\nWanted to ask what a good compute...",Haunting_Document142,30,0.90,38,https://www.reddit.com/r/computerscience/comme...,1.0
...,...,...,...,...,...,...,...,...,...,...
95,95,1.684255e+09,Programming without a stack trace: When abstra...,This [insightful article](https://architectele...,shai-ber,43,0.93,0,https://www.reddit.com/r/computerscience/comme...,1.0
96,96,1.684255e+09,"What's the difference between clock - cycle , ...",I find these terms confusing. Please help me u...,,8,0.83,2,https://www.reddit.com/r/computerscience/comme...,1.0
97,97,1.684147e+09,Curated list of all Financial Computer Science...,"Hey guys, I am currently checking all the good...",Cruncher_ben,39,0.87,13,https://www.reddit.com/r/computerscience/comme...,1.0
98,98,1.684168e+09,Operating system (ish) question,Would it be easier to make a program that mimi...,ZookeepergameFit4103,0,0.27,11,https://www.reddit.com/r/computerscience/comme...,1.0
