## Sentiment Analysis for Reddit Data

In [27]:
import praw
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import pandas as pd
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [28]:
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 1.	Data Collection

In [29]:
reddit = praw.Reddit(client_id='_DImiptAS4SzysaUp5ZhUQ',
                     client_secret='D4PNJf-0_ddWTJtmL9egt-Gie48pzQ',
                     user_agent='Responsible-Art5268')


In [30]:
headlines = set()

### Get the headlines

In [31]:
# Different sorts and time filters
sort_methods = ['new', 'hot', 'top', 'controversial']
time_filters = ['day', 'week', 'month', 'year', 'all']

# List of related subreddits
related_subreddits = ['economy', 'finance', 'business', 'investing']

# Function to fetch headlines from a given subreddit
def fetch_headlines_from_subreddit(subreddit_name):
    subreddit = reddit.subreddit(subreddit_name)
    for sort_method in sort_methods:
        if sort_method in ['top', 'controversial']:
            for time_filter in time_filters:
                fetch_headlines(subreddit, sort_method, time_filter)
                if len(headlines) >= 10000:
                    return
        else:
            fetch_headlines(subreddit, sort_method)
            if len(headlines) >= 10000:
                return

# Function to fetch headlines using a specific sort and time filter
def fetch_headlines(subreddit, sort_method, time_filter=None):
    if time_filter:
        submissions = getattr(subreddit, sort_method)(time_filter=time_filter, limit=None)
    else:
        submissions = getattr(subreddit, sort_method)(limit=None)

    for submission in submissions:
        headlines.add(submission.title)
        if len(headlines) >= 10000:
            break

# Iterate and collect headlines from each subreddit
for subreddit_name in related_subreddits:
    fetch_headlines_from_subreddit(subreddit_name)
    if len(headlines) >= 10000:
        break

print(f"Total headlines collected: {len(headlines)}")

Total headlines collected: 10000


### Converting to lowercase

In [32]:
#Converting to lowercase
lowercaseHeadlines = set()
for item in headlines:
    lowercaseHeadlines.add(item.lower())

### Removing the special characters

In [33]:
import re
import pandas as pd

# Function to remove special characters
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Clean the headlines and create a new set
cleaned_headlines = {remove_special_characters(item) for item in lowercaseHeadlines}

# Counting the number of headlines with special characters
count_special = sum(bool(re.search(r'[^a-zA-Z0-9\s]', item)) for item in lowercaseHeadlines)

print(f"Number of items with special characters: {count_special}")
print(cleaned_headlines)


Number of items with special characters: 8060


### Annotate the dataset

In [34]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

annotated_headlines = []
for headline in cleaned_headlines:
    score = sia.polarity_scores(headline)
    compound = score['compound']
    
    if compound >= 0.05:
        sentiment = 'positive'
    elif compound <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    annotated_headlines.append((headline, sentiment))

# Example: print first few annotated headlines
print(annotated_headlines[:5])

[('', 'neutral'), ('wtf indian  americans making almost double than avg   white americans ', 'negative'), ('hasbro plans to cut 1100 additional jobs', 'negative'), ('musk defies skeptics meets tesla delivery goal shares hit record', 'positive'), ('harvard announces taylor swift 101 course in an attempt to pivot from antisemitism headlines were paying 79450 annually for this ', 'positive')]


### Convert to dataframe

In [35]:
headline_df = pd.DataFrame.from_records(annotated_headlines, columns=['Headline', 'Label'])
headline_df.head()

Unnamed: 0,Headline,Label
0,,neutral
1,wtf indian americans making almost double tha...,negative
2,hasbro plans to cut 1100 additional jobs,negative
3,musk defies skeptics meets tesla delivery goal...,positive
4,harvard announces taylor swift 101 course in a...,positive


### Save to csv

In [36]:
# Save the DataFrame to a CSV file
csv_file_path = 'annotated_headlines.csv'  
headline_df.to_csv(csv_file_path, index=False)

### Remove any null headlines present after the preprocessing

In [37]:
finalHeadline_df = headline_df[headline_df['Headline'].notna() & headline_df['Headline'].str.strip().astype(bool)]
finalHeadline_df.head()

Unnamed: 0,Headline,Label
1,wtf indian americans making almost double tha...,negative
2,hasbro plans to cut 1100 additional jobs,negative
3,musk defies skeptics meets tesla delivery goal...,positive
4,harvard announces taylor swift 101 course in a...,positive
5,200000 users abandon netflix after crackdown b...,negative


In [38]:
headline_df.shape

(9987, 2)

# 2.	Preprocessing 

In [39]:
def preprocess_text(text):
    # Tokenizing the words
    tokens = word_tokenize(text)

    # Removing non-alpha characters
    tokens = [word for word in tokens if word.isalpha()] 

    # Removing stopwords
    stop_words = set(stopwords.words('english'))    
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming and Lemmatizing the words
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens] 

    return ' '.join(tokens)

# Apply the preprocess_text function using .loc
finalHeadline_df.loc[:, 'Processed_Headline'] = finalHeadline_df['Headline'].apply(preprocess_text)
finalHeadline_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finalHeadline_df.loc[:, 'Processed_Headline'] = finalHeadline_df['Headline'].apply(preprocess_text)


Unnamed: 0,Headline,Label,Processed_Headline
1,wtf indian americans making almost double tha...,negative,wtf indian american make almost doubl avg whit...
2,hasbro plans to cut 1100 additional jobs,negative,hasbro plan cut addit job
3,musk defies skeptics meets tesla delivery goal...,positive,musk defi skeptic meet tesla deliveri goal sha...
4,harvard announces taylor swift 101 course in a...,positive,harvard announc taylor swift cours attempt piv...
5,200000 users abandon netflix after crackdown b...,negative,user abandon netflix crackdown backfir


# 3.	Feature Extraction 

### TF-IDF

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(finalHeadline_df['Processed_Headline'])

In [41]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)

scaler_tfidf = StandardScaler()
X_train_tfidf_scaled = scaler_tfidf.fit_transform(X_train_tfidf.toarray()) 
X_test_tfidf_scaled = scaler_tfidf.transform(X_test_tfidf.toarray())  
classifier_tfidf = LogisticRegression(max_iter=1000)
classifier_tfidf.fit(X_train_tfidf_scaled, y_train_tfidf)

# Make predictions
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf_scaled)

# Evaluate the classifier
print("TF-IDF Performance (Scaled):")
print(classification_report(y_test_tfidf, y_pred_tfidf))

TF-IDF Performance (Scaled):
              precision    recall  f1-score   support

    negative       0.67      0.68      0.68       697
     neutral       0.62      0.68      0.65       670
    positive       0.69      0.61      0.65       631

    accuracy                           0.66      1998
   macro avg       0.66      0.66      0.66      1998
weighted avg       0.66      0.66      0.66      1998



### Word2Vec

In [42]:
import gensim

# Tokenized documents
tokenized_docs = [doc.split() for doc in finalHeadline_df['Processed_Headline']]

# Train a Word2Vec model
word2vec_model = gensim.models.Word2Vec(tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

# Function to create document vectors
def document_vector_word2vec(doc):
    words = doc.split()
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)  # Return a zero vector if no words are found
    else:
        return np.mean(word_vectors, axis=0)

word2vec_features = np.array([document_vector_word2vec(doc) for doc in finalHeadline_df['Processed_Headline']])


In [43]:
# Word2Vec
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(word2vec_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)
classifier_word2vec = LogisticRegression()
classifier_word2vec.fit(X_train_word2vec, y_train_word2vec)
y_pred_word2vec = classifier_word2vec.predict(X_test_word2vec)

# Evaluate the classifier
print("Word2Vec Performance:")
print(classification_report(y_test_word2vec, y_pred_word2vec))

Word2Vec Performance:
              precision    recall  f1-score   support

    negative       0.41      0.68      0.51       697
     neutral       0.42      0.47      0.44       670
    positive       0.40      0.06      0.10       631

    accuracy                           0.41      1998
   macro avg       0.41      0.40      0.35      1998
weighted avg       0.41      0.41      0.36      1998



### GloVe

In [44]:
import numpy as np
import gensim.downloader as api

# Load pre-trained GloVe model (choose an appropriate model)
glove_model = api.load("glove-wiki-gigaword-100")  # For example

# Function to create document vectors
def document_vector_glove(doc):
    words = doc.split()
    word_vectors = [glove_model[word] for word in words if word in glove_model]
    if len(word_vectors) == 0:
        return np.zeros(glove_model.vector_size)  # Return a zero vector if no words are found
    else:
        return np.mean(word_vectors, axis=0)

glove_features = np.array([document_vector_glove(doc) for doc in finalHeadline_df['Processed_Headline']])

In [45]:
# GloVe
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(glove_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_glove_scaled = scaler.fit_transform(X_train_glove)
X_test_glove_scaled = scaler.transform(X_test_glove)

classifier_glove = LogisticRegression(max_iter=1000)
classifier_glove.fit(X_train_glove_scaled, y_train_glove)
y_pred_glove = classifier_glove.predict(X_test_glove)

# Evaluate the classifier
print("GloVe Performance:")
print(classification_report(y_test_glove, y_pred_glove))

GloVe Performance:
              precision    recall  f1-score   support

    negative       0.55      0.77      0.64       697
     neutral       0.61      0.28      0.38       670
    positive       0.52      0.58      0.55       631

    accuracy                           0.55      1998
   macro avg       0.56      0.55      0.53      1998
weighted avg       0.56      0.55      0.53      1998



### Evaluating which Feature Extraction performs better among the three 
1. **TF-IDF Performance:**
    - Accuracy: 67% 
    - Precision, Recall, F1-Score: The three sentiment classifications are very equally distributed.
    - Observation: TF-IDF appears to function fairly well, indicating that the frequency and distinctiveness of the context in the headlines serve as reliable markers of mood. Recall and precision seem to be fairly balanced with this strategy.
    <br><br>

2. **Word2Vec Performance:**
    - Accuracy: 43%
    - Precision, Recall, and F1-Score: Much worse results, particularly with the positive class that has essentially no recall.
    - Observation: The low performance could be attributed to the Word2Vec model's inability to adequately capture semantic relationships in the data, or to the peculiar vocabulary prevalent in Reddit headlines that isn't adequately represented in the Word2Vec model's training corpus.
    <br><br>
3. **GloVe Performance:**
    - Accuracy: 55%
    - Precision, Recall, and F1-Score: Not as good as TF-IDF, but still superior to Word2Vec. In contrast, GloVe has a poorer recall but a comparatively higher precision.
    - Observation: GloVe's performance indicates that although it is superior to Word2Vec in capturing semantic associations, it is not as good as TF-IDF in providing context-specific understanding for this specific dataset.

### Insights about the dataset based on the feature extraction techniques:
- **Best Overall Method:** In terms of overall accuracy and balance between precision, recall, and F1-score, TF-IDF performs better than GloVe and Word2Vec. This suggests that, for the data collected, Word2Vec or GloVe's semantic word associations are not as predictive of sentiment as TF-IDF's capture of the value of individual terms.
- **Dataset Specificity:** Word2Vec and GloVe's performance indicates that the collected data may contain unique terminology or certain contextual nuances that these algorithms are not completely capturing. This may occur in datasets that contain slang, specialised terminology, or inventive language use.

In [46]:
finalHeadline_df

Unnamed: 0,Headline,Label,Processed_Headline
1,wtf indian americans making almost double tha...,negative,wtf indian american make almost doubl avg whit...
2,hasbro plans to cut 1100 additional jobs,negative,hasbro plan cut addit job
3,musk defies skeptics meets tesla delivery goal...,positive,musk defi skeptic meet tesla deliveri goal sha...
4,harvard announces taylor swift 101 course in a...,positive,harvard announc taylor swift cours attempt piv...
5,200000 users abandon netflix after crackdown b...,negative,user abandon netflix crackdown backfir
...,...,...,...
9982,google to purge billions of files containing p...,neutral,googl purg billion file contain person data se...
9983,corporate bankruptcies in japan highest in fiv...,neutral,corpor bankruptci japan highest five year wake...
9984,house set to pass marijuana legalization friday,neutral,hous set pas marijuana legal friday
9985,is the us getting too expensive to live in man...,negative,u get expens live mani american worri economi ...


# 4.	Model Selection and Training 

In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = nb_classifier.predict(X_test)
print("Naive Bayes Classifier Performance:")
print(classification_report(y_test, y_pred))


Naive Bayes Classifier Performance:
              precision    recall  f1-score   support

    negative       0.62      0.84      0.71       697
     neutral       0.73      0.58      0.64       670
    positive       0.73      0.61      0.66       631

    accuracy                           0.68      1998
   macro avg       0.69      0.67      0.67      1998
weighted avg       0.69      0.68      0.67      1998



In [48]:
from sklearn.svm import SVC

# Initialize and train the SVM Classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred_svm = svm_classifier.predict(X_test)
print("SVM Classifier Performance:")
print(classification_report(y_test, y_pred_svm))


SVM Classifier Performance:
              precision    recall  f1-score   support

    negative       0.77      0.77      0.77       697
     neutral       0.72      0.86      0.78       670
    positive       0.83      0.66      0.74       631

    accuracy                           0.77      1998
   macro avg       0.77      0.76      0.76      1998
weighted avg       0.77      0.77      0.77      1998



In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Convert labels to categorical
y_train_cat = to_categorical(y_train.factorize()[0])
y_test_cat = to_categorical(y_test.factorize()[0])

# Neural Network Model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(256, activation='relu'))
model.add(Dense(y_train_cat.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train.toarray(), y_train_cat, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test.toarray(), y_test_cat)
print("Neural Network Performance: Accuracy = {:.2f}".format(accuracy))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Performance: Accuracy = 0.73


In [50]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test_cat, axis=1)

report = classification_report(y_true, y_pred_classes)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.74      0.74       697
           1       0.71      0.78      0.75       670
           2       0.75      0.67      0.71       631

    accuracy                           0.73      1998
   macro avg       0.73      0.73      0.73      1998
weighted avg       0.73      0.73      0.73      1998



# 5.	Deployment and Interface

In [51]:
import tkinter as tk
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Function to preprocess and predict sentiment
def predict_sentiment():
    user_input = text_input.get("1.0", "end-1c")  
    processed_input = preprocess_text(user_input)  
    vectorized_input = tfidf_vectorizer.transform([processed_input])  
    prediction = svm_classifier.predict(vectorized_input)  
    result_label.config(text="Predicted Sentiment: " + str(prediction[0]))  

# Tkinter window
root = tk.Tk()
root.title("Social Media Sentiment Analysis")

# Text input widget
text_input = tk.Text(root, height=5, width=40)
text_input.pack()

# Predict button
predict_button = tk.Button(root, text="Predict Sentiment", command=predict_sentiment)
predict_button.pack()

# Label to display the result
result_label = tk.Label(root, text="Predicted Sentiment: ")
result_label.pack()

# Run the application
root.mainloop()
