## Final Version

In [None]:
pip install contractions praw

In [None]:
import json
import praw
import pandas as pd
import re
from datetime import datetime
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# NLTK setup: fetch the data files used by the cells below.
#   vader_lexicon -> SentimentIntensityAnalyzer
#   punkt         -> word_tokenize on older NLTK releases
#   punkt_tab     -> word_tokenize on newer NLTK releases (3.9+), which no
#                    longer load the pickled 'punkt' models
#   stopwords     -> stopwords.words('english')
#   wordnet       -> WordNetLemmatizer
# Requesting a resource an older NLTK does not know about only prints a
# "not found" message, so listing both tokenizer packages is safe.
for resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
import re
import unicodedata
import contractions  # You may need to install this package
import pandas as pd
# Remove URLs
def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` run deleted.

    A URL is taken to extend up to the next whitespace character; the
    surrounding whitespace itself is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Expand Contractions
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Remove Special Characters
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not a letter, whitespace or (optionally) a digit.

    Args:
        text: String to clean.
        remove_digits: When True, digits are stripped along with the other
            non-letter characters; when False (the default) digits are kept.

    Returns:
        The cleaned string.
    """
    # The flag picks the narrower allow-list: letters only when digits must
    # go, letters plus digits otherwise. (The two patterns used to be swapped,
    # so the flag did the opposite of what its name says.)
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Remove Accented Characters
def remove_accented_chars(text):
    """Return ``text`` reduced to its ASCII characters.

    The string is NFKD-normalized first, so an accented letter is split into
    its base letter plus combining marks; encoding to ASCII with ``'ignore'``
    then drops the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Full Normalization
def normalize_text(text):
    """Return a lower-cased, letters-and-whitespace-only version of ``text``.

    Steps, in order: lower-case, drop URLs, expand contractions, strip
    accents, drop special characters, drop digits, drop any remaining
    punctuation. Digits are removed explicitly by the ``\\d+`` substitution,
    so the result never contains them.
    """
    cleaned = remove_urls(text.lower())
    cleaned = remove_accented_chars(expand_contractions(cleaned))
    cleaned = remove_special_characters(cleaned, remove_digits=True)
    without_digits = re.sub(r'\d+', '', cleaned)  # Remove numbers
    return re.sub(r'[^\w\s]', '', without_digits)  # Remove punctuation

# Text processing functions
def process_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what is left.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    words = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in words:
        if token in english_stop_words:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

def _get_sentiment_analyzer():
    """Return one shared VADER analyzer, creating it on first use.

    Building a ``SentimentIntensityAnalyzer`` reloads its lexicon, which is
    wasteful when ``analyze_sentiment`` is applied to every row of a
    DataFrame. The instance is therefore cached on this function.
    """
    analyzer = getattr(_get_sentiment_analyzer, "_cached", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _get_sentiment_analyzer._cached = analyzer
    return analyzer


def analyze_sentiment(text):
    """Return the VADER ``compound`` polarity score for ``text``.

    Args:
        text: The string to score.

    Returns:
        The value stored under ``'compound'`` in the analyzer's
        ``polarity_scores`` result.
    """
    return _get_sentiment_analyzer().polarity_scores(text)['compound']

In [None]:

# Reddit API setup and scraping
# Credentials are read from a JSON file that provides CLIENT_ID and SECRET_KEY.
with open("./drive/MyDrive/BIA_Analise_de_sentimento/credentials.json", "r") as f:
    credentials = json.load(f)

client_id = credentials["CLIENT_ID"]
secret_key = credentials["SECRET_KEY"]
user_agent = "Scraper 1.0 by /u/eduardo_hbds1"
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=secret_key,
    user_agent=user_agent,
)

# Gather "title + newline + body" for each hot r/webdev submission.
# A set is used so identical posts are stored only once.
posts = set()
hot_submissions = reddit.subreddit('webdev').hot(limit=None)
for submission in hot_submissions:
    title_and_content = "\n".join((submission.title, submission.selftext))
    posts.add(title_and_content)

In [None]:
# Build a one-column DataFrame ('post') from the scraped texts.
# `posts` is a set, so the row order is not the order the posts were scraped in.
df = pd.DataFrame(posts, columns=['post'])
df.head()

In [None]:
# DataFrame creation and processing
#   normalized_post: cleaned string returned by normalize_text
#   processed_post:  token list returned by process_text (stop words removed, lemmatized)
df['normalized_post'] = df['post'].apply(normalize_text)
df['processed_post'] = df['normalized_post'].apply(process_text)
df.head()


In [None]:

# Sentiment Analysis
# The score is computed on the normalized string, not on the token list.
df['sentiment'] = df['normalized_post'].apply(analyze_sentiment)
# Persist every column built so far; later cells reload this file.
df.to_csv('./drive/MyDrive/BIA_Analise_de_sentimento/headlinesSentiment.csv',encoding='utf-8',index=False)
df.head()


In [None]:
def classify_sentiment(score):
    """Map a compound sentiment score to a label.

    Scores strictly above 0.05 are 'Positive', strictly below -0.05 are
    'Negative', and everything else (including exactly +/-0.05) is 'Neutral'.
    """
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'

# Label every post, then count how many posts fall in each category.
df['sentiment_category'] = df['sentiment'].apply(classify_sentiment)

sentiment_counts = df['sentiment_category'].value_counts()

import matplotlib.pyplot as plt

# Bar Chart
# value_counts() orders the categories by frequency, so a fixed positional
# colour list can paint the wrong category. Look each colour up by label instead.
category_colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'blue'}
bar_colors = [category_colors[label] for label in sentiment_counts.index]
sentiment_counts.plot(kind='bar', color=bar_colors)
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Number of Posts')
plt.show()


In [None]:
def sentiment_category(score):
    """Return 'Positive', 'Negative' or 'Neutral' for a compound score.

    The cut-offs are exclusive: only scores above 0.05 are positive and only
    scores below -0.05 are negative.
    """
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'

# Label each post from its compound score (overwrites any existing 'sentiment_category' column).
df['sentiment_category'] = df['sentiment'].apply(sentiment_category)

from nltk.probability import FreqDist
from nltk.corpus import stopwords

# Optionally remove stop words for more meaningful results
# Module-level set read by get_most_common_words below.
stop_words = set(stopwords.words('english'))

def get_most_common_words(texts, num_words=10):
    """Return the ``num_words`` most frequent non-stop-word tokens in ``texts``.

    Args:
        texts: Iterable of token sequences (one sequence per post).
        num_words: How many (token, count) pairs to return.

    Returns:
        A list of ``(token, count)`` pairs, most frequent first.
    """
    kept_tokens = []
    for tokens in texts:
        kept_tokens.extend(token for token in tokens if token not in stop_words)
    return FreqDist(kept_tokens).most_common(num_words)

# Select the token lists of the positive and of the negative posts
is_positive = df['sentiment_category'] == 'Positive'
is_negative = df['sentiment_category'] == 'Negative'
positive_texts = df.loc[is_positive, 'processed_post']
negative_texts = df.loc[is_negative, 'processed_post']

# Most frequent tokens within each group
most_common_positive = get_most_common_words(positive_texts)
most_common_negative = get_most_common_words(negative_texts)

print("Most Common Positive Words:", most_common_positive)
print("Most Common Negative Words:", most_common_negative)


In [None]:

# Word frequency distribution over the tokens of every processed post
all_words = [token for tokens in df['processed_post'] for token in tokens]
freq_dist = FreqDist(all_words)
# NOTE(review): this saves the whole DataFrame, not the word counts, under
# the name words.csv — confirm that is what is wanted.
df.to_csv(
    './drive/MyDrive/BIA_Analise_de_sentimento/words.csv',
    encoding='utf-8',
    index=False,
)
freq_dist


In [None]:

# Display the most common words
# Prints up to 100 "token: count" lines, most frequent first.
most_common_words = freq_dist.most_common(100)
print("Most common words:")
for word, freq in most_common_words:
    print(f"{word}: {freq}")


In [None]:
# Show the frequency distribution as the cell output.
freq_dist

In [None]:
# Visualization
# Plot the counts of the 10 most frequent tokens (plain counts, not cumulative).
freq_dist.plot(10, cumulative=False)
plt.show()

In [None]:
import pandas as pd

# Reload the scored posts saved by the sentiment-analysis cell.
# NOTE(review): after a CSV round trip 'processed_post' presumably comes back
# as a string rather than a list — verify before reusing it.
df = pd.read_csv('./drive/MyDrive/BIA_Analise_de_sentimento/headlinesSentiment.csv')
df.head()



In [None]:
# Read the scored posts from disk so this cell can run without the earlier ones.
df = pd.read_csv('./drive/MyDrive/BIA_Analise_de_sentimento/headlinesSentiment.csv')

# Defining a function to classify sentiment
def classify_sentiment(score):
    """Return the sentiment label for a compound score.

    'Positive' when the score is greater than 0.05, 'Negative' when it is
    less than -0.05, otherwise 'Neutral'.
    """
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'

# Label every post from its compound score
df['sentiment_category'] = df['sentiment'].apply(classify_sentiment)

# Split the raw post texts by label
category = df['sentiment_category']
positive_posts = df.loc[category == 'Positive', 'post']
negative_posts = df.loc[category == 'Negative', 'post']
neutral_posts = df.loc[category == 'Neutral', 'post']

# Print up to 5 examples per label, each followed by a separator line
separator = "#########################################################################"
example_sections = (
    ("Positive Posts:", positive_posts),
    ("\nNegative Posts:", negative_posts),
    ("\nNeutral Posts:", neutral_posts),
)
for heading, selection in example_sections:
    print(heading)
    for post in selection[:5]:
        print(post)
        print(separator)


In [None]:
# Save the posts together with their 'sentiment_category' label.
df.to_csv('./drive/MyDrive/BIA_Analise_de_sentimento/headlinesWithTagSentiment.csv',encoding='utf-8',index=False)