In [2]:
import praw
import os
import re
import pandas as pd
from dotenv import load_dotenv
from textblob import TextBlob
load_dotenv()
client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")
user_agent = os.getenv("USER_AGENT")
reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent)

print("Authenticated with Reddit API successfully!")
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  
    text = text.lower()  
    return text
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity  
subreddit = reddit.subreddit("stocks")
posts = []
for post in subreddit.top(limit=100):
    post_data = {
        'title': post.title,
        'score': post.score,
        'url': post.url,
        'content': post.selftext,
        'title_sentiment': get_sentiment(post.title),
        'content_sentiment': get_sentiment(post.selftext),
    }
    post_data['title'] = clean_text(post_data['title'])
    post_data['content'] = clean_text(post_data['content'])
    posts.append(post_data)
df = pd.DataFrame(posts)
df.to_csv('stocks_posts.csv', index=False)
print("Data has been saved to 'stocks_posts.csv' successfully!")

MissingRequiredAttributeException: Required configuration setting 'client_id' missing. 
This setting can be provided in a praw.ini file, as a keyword argument to the Reddit class constructor, or as an environment variable.

In [None]:
import pandas as pd
df = pd.read_csv('stocks_posts.csv')
print(df.head())  

In [None]:
import pandas as pd
df = pd.read_csv('stocks_posts.csv')  
print(df.head())
print(df.info())

In [None]:
for post in subreddit.top(limit=100):
    try:
        post_data = {
            'title': clean_text(post.title),
            'score': post.score,
            'url': post.url,
            'content': clean_text(post.selftext if post.selftext else ''),
            'title_sentiment': get_sentiment(post.title),
            'content_sentiment': get_sentiment(post.selftext if post.selftext else ''),
            'created_date': pd.to_datetime(post.created_utc, unit='s'),
            'num_comments': post.num_comments,
            'upvote_ratio': post.upvote_ratio,
        }
        posts.append(post_data)
    except Exception as e:
        print(f"Error processing post: {e}")

In [None]:
print(f"Authenticated: {reddit.read_only}") 

In [None]:
subreddit = reddit.subreddit("stocks")
for post in subreddit.top(limit=5):
    print(post.title)

In [None]:
posts = []
for post in subreddit.top(limit=100):
    try:
        post_data = {
            'title': clean_text(post.title),
            'score': post.score,
            'url': post.url,
            'content': clean_text(post.selftext if post.selftext else ''),
            'title_sentiment': get_sentiment(post.title),
            'content_sentiment': get_sentiment(post.selftext if post.selftext else ''),
            'created_date': pd.to_datetime(post.created_utc, unit='s'),
            'num_comments': post.num_comments,
            'upvote_ratio': post.upvote_ratio,
        }
        posts.append(post_data)
        print(f"Processed post: {post.title[:30]}...")  
    except Exception as e:
        print(f"Error processing post: {e}")

In [None]:
if not posts:
    print("No posts were processed.")
else:
    print(f"Processed {len(posts)} posts.")
    df = pd.DataFrame(posts)
    df.to_csv('stocks_posts.csv', index=False)
    print("Data has been saved to 'stocks_posts.csv' successfully!")

In [None]:
import pandas as pd
df = pd.read_csv('stocks_posts.csv')

In [None]:
df

In [None]:
print(df.isnull().sum())

In [None]:
df=df.dropna()

In [None]:
print(df.isnull().sum())

In [None]:
print(f"number of duplicates:{df.duplicated().sum()}")

In [None]:
df.describe()

In [None]:
df['title_sentiment'].unique()

In [None]:
df['content_sentiment'].unique()

In [None]:
print(df.dtypes)

In [None]:
df

In [None]:
df['title_sentiment']=df['title_sentiment'].fillna(0)

In [None]:
df['content_sentiment']=df['content_sentiment'].fillna(0)

In [None]:
df.to_csv('stocks_posts_cleaned.csv', index=False)
print("Cleaned data saved to 'stocks_posts_cleaned.csv'.")

In [None]:
import matplotlib.pyplot as plt
plt.hist(df['title_sentiment'], bins=20, alpha=0.7, label='Title Sentiment')
plt.hist(df['content_sentiment'], bins=20, alpha=0.7, label='Content Sentiment')
plt.legend()
plt.title('Sentiment Score Distribution')
plt.show()

In [None]:
conda install yfinance

In [None]:
import yfinance as yf
stock_data = yf.download("AAPL", start="2021-01-01", end="2023-12-31")
stock_data = stock_data[['Close']]
stock_data.reset_index(inplace=True)
stock_data.columns = ['date', 'close_price']
print(stock_data.head())

In [None]:
print(df.columns)

In [None]:
df['created_date'] = pd.to_datetime(df['created_date'], errors='coerce')

In [None]:
df.rename(columns={'created_date': 'date'}, inplace=True)

In [None]:
print(df.columns)

In [None]:
df

In [None]:
print("Columns in df:", df.columns)
print("Columns in stock_data:", stock_data.columns)

In [None]:
post_data['created_date'] = post.created_utc

In [None]:
print(df.head())
print(stock_data.head())

In [None]:
print(stock_data.columns)
print(stock_data.index)

In [None]:
print(stock_data.columns)

In [None]:
stock_data['date'] = pd.to_datetime(stock_data['date']).dt.date

In [None]:
print(df.columns)

In [None]:
df

In [None]:
print(df['date'].isnull().sum())
print(stock_data['date'].isnull().sum())

In [None]:
print(stock_data['date'].head())
print(df['date'].head())

In [None]:
df['date'] = pd.to_datetime(df['date']).dt.date

In [None]:
common_dates = set(df['date']).intersection(set(stock_data['date']))
print("Common Dates:", common_dates)
print("Number of Common Dates:", len(common_dates))

In [None]:
print("Columns in df:", df.columns)
print("Columns in stock_data:", stock_data.columns)


In [None]:
df = df.dropna(subset=['date'])
print("df 'date' column:", df['date'].head())
print("stock_data 'date' column:", stock_data['date'].head())

In [None]:
merged_data = pd.merge(df, stock_data, left_on='date', right_on='date', how='inner')
print("Merged DataFrame:")
print(merged_data.head())

In [None]:
print(merged_data.describe())

In [None]:
print(merged_data.columns)

In [None]:
merged_data

In [None]:
correlation = merged_data[['title_sentiment', 'content_sentiment', 'close_price']].corr()
print(correlation)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
plt.plot(merged_data['date'], merged_data['close_price'], label='Close price')
plt.plot(merged_data['date'], merged_data['title_sentiment'], label='Title Sentiment', alpha=0.7)
plt.plot(merged_data['date'], merged_data['content_sentiment'], label='Content Sentiment', alpha=0.7)
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('Stock Price vs Sentiment Scores')
plt.legend()
plt.show()

In [None]:
fig, ax1 = plt.subplots(figsize=(10, 5))

ax1.plot(merged_data['date'], merged_data['close_price'], label='Close Price', color='blue')
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

ax2 = ax1.twinx()
ax2.plot(merged_data['date'], merged_data['title_sentiment'], label='Title Sentiment', color='green', alpha=0.7)
ax2.plot(merged_data['date'], merged_data['content_sentiment'], label='Content Sentiment', color='orange', alpha=0.7)
ax2.set_ylabel('Sentiment Scores', color='green')
ax2.tick_params(axis='y', labelcolor='green')

plt.title('Stock Price vs Sentiment Scores')
fig.tight_layout()
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Normalize sentiment scores to the range of stock prices (0 to 100 for example)
merged_data['title_sentiment_normalized'] = (merged_data['title_sentiment'] + 1) * 50  # Normalize to 0-100
merged_data['content_sentiment_normalized'] = (merged_data['content_sentiment'] + 1) * 50  # Normalize to 0-100

plt.figure(figsize=(10, 5))

# Plot Close Price
plt.plot(merged_data['date'], merged_data['close_price'], label='Close Price', color='blue')

# Plot Sentiment Scores
plt.plot(merged_data['date'], merged_data['title_sentiment_normalized'], label='Title Sentiment', alpha=0.7, color='orange')
plt.plot(merged_data['date'], merged_data['content_sentiment_normalized'], label='Content Sentiment', alpha=0.7, color='green')

plt.xlabel('Date')
plt.ylabel('Values')
plt.title('Stock Price vs Sentiment Scores')
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(10, 5))
plt.hist(merged_data['title_sentiment'], bins=20, alpha=0.7, label='Title Sentiment')
plt.hist(merged_data['content_sentiment'], bins=20, alpha=0.7, label='Content Sentiment')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
plt.scatter(merged_data['content_sentiment'], merged_data['close_price'], alpha=0.5)
plt.xlabel('Content Sentiment')
plt.ylabel('Stock Volume')
plt.title('Content Sentiment vs Stock Volume')
plt.show()

In [None]:
merged_data['price_change'] = merged_data['close_price'].diff().shift(-1)
merged_data['price_direction'] = merged_data['price_change'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

features = merged_data[['title_sentiment', 'content_sentiment', 'close_price']]
target = merged_data['price_direction']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

In [None]:
import pandas as pd
stock_data_cleaned = pd.read_csv('stocks_posts_cleaned.csv')
stock_data_cleaned.to_excel('stocks_posts_cleaned.xlsx', index=False, engine='openpyxl')
print("CSV file has been successfully converted to Excel.")