# Analyzing the scraped data with pandas and mlflow

In [4]:
# MLflow
import mlflow
from mlflow.models import infer_signature

# MLflow model
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Visualizations
import matplotlib.pyplot as plt

# math functions
import numpy as np

# read CSV file
import os
import glob

# data frames
import pandas as pd

# regexes
import re

In [5]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

# Load Libraries

import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain

# reddit crawler
import praw

# sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 8) # default plot size
import seaborn as sns
sns.set(style='whitegrid', palette='Dark2')
from wordcloud import WordCloud

# Downloading NLTK's databases
nltk.download('vader_lexicon') # get lexicons data
nltk.download('punkt') # for tokenizer
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load dataset in pandas

In [18]:
# future warnings
import warnings
warnings.filterwarnings("ignore")

topics_dict = { "date":[],
                "author":[],
                "title":[],
                "ups":[],
                "downs":[],
                "score":[],
                "id":[],
                "url":[],
                "comms_num": [],
                "created": [],
                "body":[]}
# Use the Reddit dataset
"""
['date','author','title','ups','downs','score','id','url','comms_num','created','body']
"""
df = pd.DataFrame()
running_total = 0
for fname in glob.glob(os.path.abspath('./data/**/*.csv')):
    _df=pd.read_csv(fname)
    _df['query'] = os.path.splitext(os.path.basename(fname))[0]
    _df['subreddit'] = os.path.basename(os.path.dirname(fname))
    df = df.append(_df.copy(), ignore_index=True)
    running_total+=len(_df)
    print(fname)
    print(running_total)
    #break #DEBUG

# setup the created datetime
df['created'] = pd.to_datetime(df['created'], unit='s')

## Run sentiment analysis

In [7]:
# Clean data

#Create a function to clean the tweets
def cleanTxt(text):
    text = re.sub(r'@[A-Za-z0–9]+', '', text) #Remove @mentions replace with blank
    text = re.sub(r'#', '', text) #Remove the ‘#’ symbol, replace with blank
    text = re.sub(r'RT[\s]+', '', text) #Removing RT, replace with blank
    text = re.sub(r'https?:\/\/\S+', '', text) #Remove the hyperlinks
    text = re.sub(r':', '', text) # Remove :
    return text

#Next we have to remove emoji & Unicode from the Tweet data.
def remove_emoji(string):
    emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    u"\U00002500-\U00002BEF" # chinese char
    u"\U00002702-\U000027B0"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f" # dingbats
    u"\u3030"
    "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [8]:
# remove rows without a comment body
df.dropna(subset='body', how="any", inplace=True)

df['clean'] = df['body'].apply(lambda x: remove_emoji(cleanTxt(x)))

In [15]:
# VADER sentiment analysis
sid = SentimentIntensityAnalyzer()

df[['pos', 'neg', 'neu', 'compound']] = df['clean'].apply(lambda text: pd.Series(sid.polarity_scores(text)))

# Threshold conditions determine the value of the sentiment of the text
THRESHOLD = 0.2
conditions = [
    (df['compound'] <= -THRESHOLD),
    (df['compound'] > -THRESHOLD) & (df['compound'] < THRESHOLD),
    (df['compound'] >= THRESHOLD),
    ]
values = ["neg", "neu", "pos"]
df['label'] = np.select(conditions, values)

df.head()

# Convert all sentiment columns to numeric type
df[['pos', 'neg', 'neu', 'compound']] = df[['pos', 'neg', 'neu', 'compound']].apply(pd.to_numeric, errors='coerce')

## Filtering dataset from November 2022 to January 2023

In [20]:
# converting created dates from reddit API into human readable format
from datetime import datetime

# Define the date range
start_date = datetime(2022, 11, 1)
end_date = datetime(2023, 1, 31)

df[(df['created'] >= start_date) & (df['created'] <= end_date)]

Unnamed: 0.1,Unnamed: 0,date,author,title,ups,downs,score,id,url,comms_num,created,body,query,subreddit,clean,pos,neg,neu,compound,label
40,40,2022-12-17 13:18:18,yfzi,ChatGPT AI just solved an unsolved math proble...,0,0,0,zo64dm,https://www.reddit.com/r/artificial/comments/z...,7,2022-12-17 13:18:18,I first asked the chatbot (**ChatGPT** by Open...,gpt-3.0,artificial,I first asked the chatbot (**ChatGPT** by Open...,0.020,0.825,0.156,0.9981,pos
86,86,2022-12-17 13:18:18,yfzi,ChatGPT AI just solved an unsolved math proble...,0,0,0,zo64dm,https://www.reddit.com/r/artificial/comments/z...,7,2022-12-17 13:18:18,I first asked the chatbot (**ChatGPT** by Open...,gpt-3.0,artificial,I first asked the chatbot (**ChatGPT** by Open...,0.020,0.825,0.156,0.9981,pos
131,40,2022-12-17 13:18:18,yfzi,ChatGPT AI just solved an unsolved math proble...,0,0,0,zo64dm,https://www.reddit.com/r/artificial/comments/z...,7,2022-12-17 13:18:18,I first asked the chatbot (**ChatGPT** by Open...,gpt-4.0,artificial,I first asked the chatbot (**ChatGPT** by Open...,0.020,0.825,0.156,0.9981,pos
175,40,2022-12-17 13:18:18,yfzi,ChatGPT AI just solved an unsolved math proble...,0,0,0,zo64dm,https://www.reddit.com/r/artificial/comments/z...,7,2022-12-17 13:18:18,I first asked the chatbot (**ChatGPT** by Open...,gpt,artificial,I first asked the chatbot (**ChatGPT** by Open...,0.020,0.825,0.156,0.9981,pos
221,86,2022-12-17 13:18:18,yfzi,ChatGPT AI just solved an unsolved math proble...,0,0,0,zo64dm,https://www.reddit.com/r/artificial/comments/z...,7,2022-12-17 13:18:18,I first asked the chatbot (**ChatGPT** by Open...,gpt,artificial,I first asked the chatbot (**ChatGPT** by Open...,0.020,0.825,0.156,0.9981,pos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54743,775,2022-12-20 22:54:48,Singularian2501,[R] Nonparametric Masked Language Modeling - M...,271,0,271,zr2en7,https://www.reddit.com/r/MachineLearning/comme...,31,2022-12-20 22:54:48,Paper: [https://arxiv.org/abs/2212.01349](http...,openai,machinelearning,Paper [\n\nGithub [\n\nAbstract\n\n>Existing l...,0.022,0.954,0.024,0.0516,neu
54769,801,2023-01-20 10:41:04,ChubChubkitty,[N] OpenAI Used Kenyan Workers on Less Than $2...,524,0,524,10gtruu,https://www.reddit.com/r/MachineLearning/comme...,246,2023-01-20 10:41:04,https://time.com/6247678/openai-chatgpt-kenya-...,openai,machinelearning,,0.000,0.000,0.000,0.0000,neu
54777,809,2022-11-03 23:12:45,TiredOldCrow,"[D] DALL·E to be made available as API, OpenAI...",419,0,419,yli0r7,https://www.reddit.com/r/MachineLearning/comme...,55,2022-11-03 23:12:45,Email announcement from OpenAI below:\n\n\n> D...,openai,machinelearning,Email announcement from OpenAI below\n\n\n> DA...,0.016,0.799,0.186,0.9852,pos
54793,825,2022-12-22 18:39:30,_underlines_,[D] When chatGPT stops being free: Run SOTA LL...,349,0,349,zstequ,https://www.reddit.com/r/MachineLearning/comme...,95,2022-12-22 18:39:30,Edit: Found [LAION-AI/OPEN-ASSISTANT](https://...,openai,machinelearning,Edit Found [LAION-AI/OPEN-ASSISTANT]( a very p...,0.074,0.822,0.104,0.9386,pos
