In [1]:
import pandas as pd
import datetime as dt 

# Visualization libraries
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns

# Transformer model
from transformers import pipeline

In [2]:
# Read the posts table and the comments table from their CSV files
posts_df, comments_df = (
    pd.read_csv(csv_path)
    for csv_path in ('DS_ML_AI_posts.csv', 'DS_ML_AI_comments.csv')
)

In [3]:
# Convert the epoch timestamps with one vectorised call that stays in UTC.
# `datetime.fromtimestamp` would apply the local timezone of whichever machine
# runs the notebook, so posts close to New Year could land in a different
# `created_year` depending on where the notebook is executed.
# NOTE(review): assumes `created_utc` holds epoch *seconds* in UTC, as its name
# and the previous `fromtimestamp` call suggest - confirm against the data.
posts_df['created_date'] = pd.to_datetime(posts_df['created_utc'], unit='s')

# Calendar year of each post, used to filter posts by year
posts_df['created_year'] = posts_df['created_date'].dt.year
posts_df

In [4]:
# Preview the comments table
comments_df

In [5]:
# Attach each comment to its post via `post_id`; the left join keeps posts
# that have no comments (their comment columns are filled with NaN)
comments_posts_df = pd.merge(
    posts_df,
    comments_df,
    how='left',
    on='post_id',
)

comments_posts_df

In [6]:
# Keep only the rows that actually carry a comment
comments_posts_df = comments_posts_df.dropna(subset=['comment'])

### EDA

In [7]:
# Preview the posts table
posts_df

### Wordcloud post titles

In [8]:
# Build one lower-cased text blob from all post titles.
# Titles are joined with a space so the last word of one title does not fuse
# with the first word of the next, and missing titles are dropped first
# because str.join raises TypeError on NaN.
post_title_text = ' '.join(posts_df['post_title'].dropna().str.lower())

word_cloud = WordCloud(
    collocation_threshold=2,
    width=1000,
    height=500,
    background_color='white'
).generate(post_title_text)

# Display the generated Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

### Wordcloud post titles by year

In [9]:
# Year whose post titles are used for the per-year word cloud
selected_year = 2023

In [10]:
# Restrict the posts to the selected year
posts_in_year = posts_df[posts_df['created_year'] == selected_year]

# Join the non-missing titles with a space; an empty separator would glue the
# last word of one title onto the first word of the next and distort the
# word frequencies shown in the cloud.
post_title_text_year = ' '.join(posts_in_year['post_title'].dropna())

word_cloud = WordCloud(
    collocation_threshold=2,
    width=1000,
    height=500,
    background_color='white'
).generate(post_title_text_year)

# Display the generated Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

### Sentiment analysis

In [11]:
# Create the sentiment pipeline from the named model checkpoint
sentiment_classifier = pipeline(
    model="finiteautomata/bertweet-base-sentiment-analysis",
)

In [12]:
# Try the sentiment classifier on a single example sentence
sentiment_classifier("I don't love you")

In [13]:
# Keyword used to select the post titles whose comments get analysed
input_word = "value"

In [14]:
def get_sentiment(text):
    """Return the sentiment label predicted for ``text``.

    The text is passed to the notebook-level ``sentiment_classifier`` and the
    ``label`` of its first prediction is returned.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        The predicted label, or ``'Not classified'`` when the classifier
        raises an error for this input.
    """
    try:
        return sentiment_classifier(text)[0]['label']
    except Exception:
        # Best effort: one failing text must not abort a whole Series.apply.
        # Catching Exception (instead of a bare except) still lets
        # KeyboardInterrupt and SystemExit stop a long-running run.
        return 'Not classified'

In [15]:
# Select the comments whose post title contains the keyword.
# - regex=False: treat the keyword literally, so characters such as '+' or '('
#   cannot raise a regex error or match unintended titles.
# - na=False: rows with a missing title count as "no match" instead of putting
#   NaN into the boolean mask, which would make the indexing raise.
# - .copy(): later cells add columns to this subset; copying avoids writing
#   into a slice of comments_posts_df (SettingWithCopyWarning).
title_matches = comments_posts_df['post_title'].str.contains(
    input_word, regex=False, na=False
)
comments_posts_df_sub = comments_posts_df[title_matches].copy()
comments_posts_df_sub

In [16]:
# Label every comment in the subset with its predicted sentiment
comment_texts = comments_posts_df_sub['comment'].astype(str)
comments_posts_df_sub['sentiment'] = comment_texts.apply(get_sentiment)
comments_posts_df_sub

In [17]:
# Import only the lets-plot names this cell uses instead of `import *`, so it
# is clear where each name comes from and nothing else in the notebook
# namespace gets shadowed.
from lets_plot import (
    aes,
    element_blank,
    geom_pie,
    ggplot,
    ggtitle,
    layer_labels,
    theme,
)
from lets_plot.mapping import as_discrete

# Slice annotation: the sentiment label plus its share formatted as a percentage
sentiment_labels = (
    layer_labels()
    .line('@sentiment')
    .line('(@{..prop..})')
    .format('..prop..', '.0%')
)

# Pie chart of the comment sentiments, slices ordered by their count.
# Axes, grid lines and the legend are hidden.
(
    ggplot(comments_posts_df_sub)
    + geom_pie(
        aes(fill=as_discrete('sentiment', order_by='..count..')),
        size=30, hole=0.2, stroke=1.0,
        labels=sentiment_labels,
    )
    + theme(line=element_blank(), axis_text=element_blank(),
            axis_title=element_blank(), legend_position='none')
    + ggtitle('Sentiment of around the topic')
)

### Emotion recognition

In [18]:
# Create the emotion pipeline from the named model checkpoint.
# return_all_scores=True asks for a score per label rather than only the best
# one; get_emotion then picks the highest-scoring label itself.
emotion_classifier = pipeline(
    'text-classification',
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True,
)

In [19]:
# Try the emotion classifier on a single example sentence
emotion_classifier('Icecream is delicious')

In [20]:
def get_emotion(text):
    """Return the emotion label with the highest score for ``text``.

    The text is passed to the notebook-level ``emotion_classifier``; the first
    element of its result is expected to be a list of
    ``{'label': ..., 'score': ...}`` dicts, one per emotion.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        The ``label`` of the entry with the largest ``score``.
    """
    # truncation=True clips inputs that exceed the model's maximum sequence
    # length. Without it a single over-long comment makes the pipeline raise,
    # which aborts a whole Series.apply over the comments.
    pred_scores = emotion_classifier(text, truncation=True)

    # Pick the emotion with the highest prediction score
    best = max(pred_scores[0], key=lambda candidate: candidate['score'])
    return best['label']

In [21]:
# Label every comment in the subset with its highest-scoring emotion
comment_texts = comments_posts_df_sub['comment'].astype(str)
comments_posts_df_sub['emotion'] = comment_texts.apply(get_emotion)

comments_posts_df_sub

In [3]:
from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from langchain import OpenAI 
import os
from IPython.display import Markdown, display

In [None]:
# Collapse the comments of each post (identified by title + body text) into a
# single '. '-separated string, giving one row per post
comments_posts_df_tmp = comments_posts_df[['post_title', 'selftext', 'comment']].astype(str)
agg_comments = (
    comments_posts_df_tmp
    .groupby(['post_title', 'selftext'])['comment']
    .apply('. '.join)
    .reset_index()
)
agg_comments

In [None]:
# Join title, body text and aggregated comments of each post into one string.
# The source columns are named explicitly: joining *all* columns would pull the
# previously created `combined_text` into itself whenever this cell is re-run,
# duplicating the text on every execution.
text_columns = ['post_title', 'selftext', 'comment']
agg_comments['combined_text'] = agg_comments[text_columns].astype(str).agg('. '.join, axis=1)

# One long document containing every post with its comments.
# NOTE(review): no visible cell writes `all_text` to disk, while the index is
# later built from files in a directory - confirm how the text gets there.
all_text = ' '.join(agg_comments['combined_text'])

In [None]:
def construct_index(
    directory_path,
    index_path='index.json',
    max_input_size=4096,
    num_outputs=256,
    max_chunk_overlap=20,
    chunk_size_limit=600,
):
    """Build a vector index over the files in ``directory_path`` and save it.

    Parameters
    ----------
    directory_path : str
        Directory whose files are loaded with ``SimpleDirectoryReader``.
    index_path : str, optional
        File the index is saved to. Defaults to ``'index.json'``.
    max_input_size : int, optional
        Maximum input size handed to ``PromptHelper``.
    num_outputs : int, optional
        Number of output tokens; used both as the LLM's ``max_tokens`` and as
        the ``PromptHelper`` output size.
    max_chunk_overlap : int, optional
        Maximum chunk overlap handed to ``PromptHelper``.
    chunk_size_limit : int, optional
        Chunk size limit handed to ``PromptHelper``.

    Returns
    -------
    GPTSimpleVectorIndex
        The index that was built and saved.
    """
    # define LLM (ChatGPT gpt-3.5-turbo)
    llm_predictor = LLMPredictor(
        llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=num_outputs)
    )
    prompt_helper = PromptHelper(
        max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit
    )

    documents = SimpleDirectoryReader(directory_path).load_data()

    index = GPTSimpleVectorIndex(
        documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    # Persist the index so it can be reloaded later without rebuilding it
    index.save_to_disk(index_path)

    return index


def ask_me_anything(question, index_path='index.json'):
    """Query the saved index with ``question`` and display the answer.

    The index is loaded from ``index_path`` on every call, queried in
    ``"compact"`` response mode, and both the question and the answer text
    are rendered as Markdown in the notebook output.

    Parameters
    ----------
    question : str
        Question to ask.
    index_path : str, optional
        File the index is loaded from. Defaults to ``'index.json'``.

    Returns
    -------
    None
    """
    index = GPTSimpleVectorIndex.load_from_disk(index_path)
    response = index.query(question, response_mode="compact")

    display(Markdown(f"You asked: <b>{question}</b>"))
    display(Markdown(f"Bot says: <b>{response.response}</b>"))

In [None]:
# Placeholder for the OpenAI API key.
# NOTE(review): avoid committing a real key in the notebook; prefer reading it
# from the environment or a secrets store.
openapi_key = "value"

In [None]:
# Set OpenAI key.
# The key variable defined in the notebook is `openapi_key`; the previous
# reference to `openai_key` named a variable that is never defined and
# raised NameError.
os.environ["OPENAI_API_KEY"] = openapi_key

In [None]:
# Construct our index (ONLY NEED TO RUN ONCE! BE CAREFUL THAT THIS COSTS MONEY)
# Loads every file in the folder, builds the vector index from the documents
# (chunking and embedding happen inside the index/LLM libraries) and saves the
# result to index.json.
construct_index('/data/notebook_files/textdata')

In [None]:
# Question that will be sent to the saved index
question = "value"

In [None]:
# Query the saved index and display the question together with the answer
ask_me_anything(question)