In [1]:
import re
import nltk
import json
import string
import pandas as pd
import random
import numpy as np
import joblib
import nltk.classify
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/zhaoyiting/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhaoyiting/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhaoyiting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class Sentiment_Scorer:
    def __init__(self, data, model):
        # Remove Nan
        data = data.dropna(subset=['text'])
        self.data = data
        if model == None:
            raise Exception("Model has not been selected")
        if model == 'K Nearest Neighbors':
            self.model = joblib.load('joblib-KNN-Model.pkl')
        elif model == 'Decision Tree':
            self.model = joblib.load('joblib-DT-Model.pkl')
        elif model == 'Random Forest':
            self.model = joblib.load('joblib-RF-Model.pkl')
        elif model == 'Logistic Regression':
            self.model = joblib.load('joblib-LR-Model.pkl')
        elif model == 'SGD Classifier':
            self.model = joblib.load('joblib-SC-Model.pkl')
        elif model == 'Hard Voting Classifier':
            self.model = joblib.load('joblib-vh-Model.pkl')
        elif model == 'Soft Voting Classifier':
            self.model = joblib.load('joblib-vs-Model.pkl')
        else:
            raise Exception("The selected model is not currently supported by our API")
            
        self.tfidf_vectorizer = joblib.load('tfidf-vector.pkl')
        
        with open("negative.json", "r") as fp: 
            self.positive_words = json.load(fp)
        with open("positive.json", "r") as fp: 
            self.negative_words = json.load(fp)

        self.analyzer = SentimentIntensityAnalyzer()
        self.ps = PorterStemmer()

        
    def clean_text(self,text):
        def preprocess_text(text):
            text = text.lower()
            #eliminate the punctuation, URL, and @
            text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text) 
            text = re.sub(r'\d+', '', text) # Remove digits
            text = re.sub(r'[^\w\s]', '', text) # Remove special characters
            tokens = nltk.word_tokenize(text) # Tokenize the text
            return tokens
        
        def remove_stopwords(tokens):
            stop_words = set(stopwords.words('english'))
            filtered_words = [word for word in tokens if word not in stop_words]
            return filtered_words
        
        def perform_lemmatization(tokens):
            lemmatizer = nltk.WordNetLemmatizer()
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
            return lemmatized_tokens
        
        tokens = preprocess_text(text)
        filtered_tokens = remove_stopwords(tokens)
        lemmatized_tokens = perform_lemmatization(filtered_tokens)
        clean_text = ' '.join(lemmatized_tokens)
        return clean_text
    
    
    def get_features(self, text):
        def intersection(list1,list2):
            x = set(list1)
            y = set(list2)
            z = x.intersection(y)
            return len(z)
    
        
        features = {}
        
        # Feature #1 - verbosity
        features['verbosity'] = len(text)
        
        # Feature #2 - lexical word choice
        scores = self.analyzer.polarity_scores(text)
        features['vader(pos)'] = scores['pos']
        features['vader(neg)'] = scores['neg']
        features['vader(neu)'] = scores['neu']
        features['vader(compound)'] = scores['compound']
        
        # Feature #3 - Positive and Negative Words Frequency
        words = text.split()
        words = [self.ps.stem(word) for word in words]
        pos = intersection(words,self.positive_words)
        neg = intersection(words,self.negative_words)
        features['num_pos'] = pos
        features['num_neg'] = neg
        try:
            features['tone'] = (pos-neg) / (pos+neg)
        except:
            features['tone'] = 0
    
        # Feature #4 - TFIDF
        vectors = self.tfidf_vectorizer.transform([text]).toarray()
    
        for column in range(vectors.shape[1]):
            feature_name = 'tfidf_' + str(column)
            features[feature_name] = vectors[0][column]    

        return features


    def label_dataset(self):
        ## 0 Indicating it is a Neutral Tweet/Comment
        ## 1 Indicating a Postive Sentiment
        ## -1 Indicating a Negative Tweet/Comment
        res = []
        for index, row in self.data.iterrows():
            try:
                res.append(self.model.classify(self.get_features(self.clean_text(row['text']))))
            except:
                res.append(None)
        result_df = self.data
        result_df['sentiment'] = res
    
        return result_df

In [3]:
df_predict = pd.read_csv('raw_data/UIUC_MCS_hot_posts.csv')

In [4]:
Sentiment_Score = Sentiment_Scorer(df_predict,'Soft Voting Classifier')

In [5]:
result = Sentiment_Score.label_dataset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['sentiment'] = res


In [6]:
result

Unnamed: 0.1,Unnamed: 0,created_utc,title,text,author,score,upvote_ratio,num_comments,url,sentiment
0,0,1.692469e+09,Spring 2024 Online MCS/MCS-DS Admission Thread,# Spring 2024 Online MCS/MCS-DS Admission Thre...,Massive-Oil-5897,15,1.00,67,https://www.reddit.com/r/UIUC_MCS/comments/15v...,1
1,1,1.701647e+09,Price slashed from $1120 to $950! Looking for ...,Studio Apartment; DM if interested.\n\nPrice s...,kingpin895,1,1.00,0,https://www.reddit.com/r/UIUC_MCS/comments/18a...,1
2,2,1.701543e+09,Career Outcomes for MCS program (either in-per...,I’m a potential applicant to the MCS program w...,Striking_Ad_6131,12,1.00,1,https://www.reddit.com/r/UIUC_MCS/comments/189...,1
3,3,1.701464e+09,Course selection for those just starting the U...,"Hi everyone, \n\nI've learned so much from thi...",TigerSeldon,1,0.67,10,https://www.reddit.com/r/UIUC_MCS/comments/188...,1
4,4,1.701286e+09,UIUC vs MSE-DS,I am currently pursuing my master's in data sc...,ArcticODE,4,1.00,4,https://www.reddit.com/r/UIUC_MCS/comments/186...,1
...,...,...,...,...,...,...,...,...,...,...
494,494,1.626625e+09,I'm a recent biology grad w/ a minor in CS. Is...,"Hi everyone! To give you some background, I'm ...",iamthat1dude,5,0.78,5,https://www.reddit.com/r/UIUC_MCS/comments/omt...,1
495,495,1.626558e+09,Course Review Hub Similar to GT and UT?,"Hi, I am a newly accepted UIUC Online MCS stud...",Allentownyeera,5,1.00,11,https://www.reddit.com/r/UIUC_MCS/comments/omd...,1
496,496,1.626534e+09,Help needed in creating a course structure!,I've got a recommendation from the UIUC for th...,tukaibat,4,0.84,4,https://www.reddit.com/r/UIUC_MCS/comments/om5...,1
497,497,1.626456e+09,Still “awaiting decision” for Fall ‘21,I applied last November for the on-campus MCS ...,pottersfloppy,2,1.00,9,https://www.reddit.com/r/UIUC_MCS/comments/oll...,1


In [7]:
result.groupby('sentiment').count()

Unnamed: 0_level_0,Unnamed: 0,created_utc,title,text,author,score,upvote_ratio,num_comments,url
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,8,8,8,8,8,8,8,8,8
0,51,51,51,51,48,51,51,51,51
1,401,401,401,401,392,401,401,401,401
