In [1]:
from google.colab import drive
import os

# Attach Google Drive so everything written below persists in the Drive folder.
drive.mount('/content/drive')

# Project root inside Drive (edit this path to relocate the project).
PROJECT = "/content/drive/MyDrive/datascience/ds_aditya_sawant"

# Create the two working sub-folders; folders that already exist are left alone.
for _sub in ("csv_files", "outputs"):
    os.makedirs(os.path.join(PROJECT, _sub), exist_ok=True)

# Echo the resolved locations.
print("Project folder:", PROJECT)
for _sub in ("csv_files", "outputs"):
    print(f"{_sub}:", os.path.join(PROJECT, _sub))


Mounted at /content/drive
Project folder: /content/drive/MyDrive/datascience/ds_aditya_sawant
csv_files: /content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files
outputs: /content/drive/MyDrive/datascience/ds_aditya_sawant/outputs


In [3]:
import os

# Input folder inside the mounted Drive project.
DATA_DIR = "/content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files"

# List the folder so the exact filenames are visible before loading.
csv_dir_listing = os.listdir(DATA_DIR)
print(csv_dir_listing)


['historical_data.csv', 'fear_greed_index.csv']


In [4]:
import pandas as pd
import numpy as np
import os

# Folder that holds the input CSVs.
DATA_DIR = "/content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files"

# Resolve both inputs by the exact filenames present in that folder.
hist_path, fg_path = (
    os.path.join(DATA_DIR, fname)
    for fname in ("historical_data.csv", "fear_greed_index.csv")
)

print("Using trade file:", hist_path)
print("Using FG file:", fg_path)

# Read the trade history and the fear/greed index into DataFrames.
trades, fg = pd.read_csv(hist_path), pd.read_csv(fg_path)

# Trade timestamps: an integer/float column is read as Unix epoch milliseconds,
# anything else goes through pandas' generic datetime parser. Values that
# cannot be parsed become NaT in both cases.
raw_ts = trades['Timestamp']
trades['Timestamp_dt'] = (
    pd.to_datetime(raw_ts, unit='ms', errors='coerce')
    if raw_ts.dtype.kind in ('i','f')
    else pd.to_datetime(raw_ts, errors='coerce')
)

# Calendar date of each trade; this is the join key used against the index.
trades['trade_date'] = trades['Timestamp_dt'].dt.date

# Index date: prefer the 'date' column, otherwise fall back to the first column.
fg_date_source = fg['date'] if 'date' in fg.columns else fg.iloc[:,0]
fg['fg_date'] = pd.to_datetime(fg_date_source, errors='coerce').dt.date

# Sentiment score as a number (non-numeric entries become NaN).
if 'value' in fg.columns:
    fg['value'] = pd.to_numeric(fg['value'], errors='coerce')

# merge on date
# Build a one-row-per-day sentiment lookup before joining:
#  - reindex (instead of a hard [[...]] selection) so that a missing 'value' or
#    'classification' column shows up as all-NaN rather than raising KeyError;
#    the 'value' guard above and the fallback mapping below both allow for a
#    missing column.
#  - drop rows whose date failed to parse: pandas' merge matches null keys to
#    each other, so a NaT fg_date would otherwise attach its sentiment to every
#    trade whose own timestamp failed to parse.
#  - keep the first row per date so the left join cannot multiply trade rows.
fg_daily = (
    fg.reindex(columns=['fg_date','value','classification'])
      .dropna(subset=['fg_date'])
      .drop_duplicates('fg_date')
)
merged = trades.merge(fg_daily, left_on='trade_date', right_on='fg_date', how='left')

# numeric housekeeping
def _numeric_col(frame, name):
    """Return ``frame[name]`` coerced to numeric, or an all-NaN Series if absent.

    Always returning a Series keeps Series-only methods such as ``.abs()``
    usable by the caller; ``pd.to_numeric`` on a scalar default returns a
    plain float, which has no ``.abs()``.
    """
    if name in frame.columns:
        return pd.to_numeric(frame[name], errors='coerce')
    return pd.Series(np.nan, index=frame.index, dtype='float64')

# Trade size: use 'Size USD' when present, otherwise |price| * |token size|
# (NaN if either of those columns is missing).
if 'Size USD' in merged.columns:
    merged['notional'] = _numeric_col(merged, 'Size USD')
else:
    merged['notional'] = (_numeric_col(merged, 'Execution Price').abs()
                          * _numeric_col(merged, 'Size Tokens').abs())

# Closed PnL: accept either column spelling; all-NaN when neither exists.
pnl_source = 'Closed PnL' if 'Closed PnL' in merged.columns else 'closed_pnl'
merged['closed_pnl'] = _numeric_col(merged, pnl_source)

# PnL relative to trade size; a zero notional is mapped to NaN so the division
# yields NaN rather than +/-inf.
merged['return_pct'] = merged['closed_pnl'] / merged['notional'].replace({0:np.nan})

# 1 when closed PnL is strictly positive, else 0 (a missing PnL also gives 0).
merged['profitable'] = (merged['closed_pnl'] > 0).astype(int)

# Fallback: when no numeric sentiment value is available at all, derive one
# from the text classification. Keywords are tried in this order and the first
# one contained in the lower-cased label wins; labels matching none are skipped.
KEYWORD_SCORES = (('greed', 60), ('fear', 30), ('neutral', 55))

if merged['value'].isna().all() and 'classification' in merged.columns:
    mapping = {}
    for label in merged['classification'].dropna().unique():
        lowered = str(label).lower()
        score = next((val for key, val in KEYWORD_SCORES if key in lowered), None)
        if score is not None:
            mapping[label] = score
    if mapping:
        merged['value'] = merged['value'].fillna(merged['classification'].map(mapping))

# Numeric sentiment used by everything downstream.
merged['_sentiment_num'] = pd.to_numeric(merged['value'], errors='coerce')

# Three coarse buckets with right-closed bins:
# (-999, 49] -> Fear, (49, 59] -> Neutral, (59, 999] -> Greed.
sentiment_bin_edges = [-999,49,59,999]
sentiment_bin_labels = ['Fear','Neutral','Greed']
merged['sentiment_group'] = pd.cut(
    merged['_sentiment_num'],
    bins=sentiment_bin_edges,
    labels=sentiment_bin_labels,
)

# lag & rolling per account
merged = merged.sort_values(['Account','Timestamp_dt'])

# Sentiment is a daily value repeated on every trade of that day. Shifting or
# rolling over trade rows therefore mostly returns the same day's value again.
# Compute the features on one row per (Account, trade_date) instead, so that:
#   sentiment_lag_1 / _3 = sentiment on the account's 1st / 3rd previous trading day
#   sentiment_ma_7       = mean sentiment over the account's last 7 trading days
# NOTE(review): "day" here means a day on which the account traded, not a
# calendar day - confirm this is the intended definition.
# Rows are already in chronological order within each account (sort above), so
# drop_duplicates keeps the days in order. Trades without a parseable date are
# left out and end up with NaN features after the merge below.
acct_days = (
    merged[['Account','trade_date','_sentiment_num']]
    .dropna(subset=['trade_date'])
    .drop_duplicates(['Account','trade_date'])
)
daily_by_acct = acct_days.groupby('Account')['_sentiment_num']
acct_days = acct_days.assign(
    sentiment_lag_1=daily_by_acct.shift(1),
    sentiment_lag_3=daily_by_acct.shift(3),
    sentiment_ma_7=daily_by_acct.transform(lambda x: x.rolling(7, min_periods=1).mean()),
)

# Broadcast the per-day features back onto every trade of that account and day.
# The right-hand keys are unique, so the row count and row order are unchanged.
merged = merged.merge(
    acct_days.drop(columns='_sentiment_num'),
    on=['Account','trade_date'],
    how='left',
)

# Processed outputs are written into the Drive csv_files folder.
merged_path = os.path.join(DATA_DIR, "merged_trades.csv")
final_ml_path = os.path.join(DATA_DIR, "final_ml_dataset.csv")

# Full merged table, including rows with missing values.
merged.to_csv(merged_path, index=False)

# Modelling table: only rows where sentiment, notional and closed PnL are all present.
ml_required_cols = ['_sentiment_num','notional','closed_pnl']
final_ml = merged.dropna(subset=ml_required_cols)
final_ml.to_csv(final_ml_path, index=False)

# diagnostics
for label, shown in (
    ("Saved merged:", merged_path),
    ("Saved final ML:", final_ml_path),
    ("Merged shape:", merged.shape),
    ("Rows with sentiment:", merged['_sentiment_num'].notna().sum()),
):
    print(label, shown)


Using trade file: /content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files/historical_data.csv
Using FG file: /content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files/fear_greed_index.csv
Saved merged: /content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files/merged_trades.csv
Saved final ML: /content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files/final_ml_dataset.csv
Merged shape: (211224, 30)
Rows with sentiment: 184263


In [5]:
# Create and save plots (run in Colab)
import os, matplotlib.pyplot as plt, seaborn as sns, numpy as np, pandas as pd

# Input (processed CSVs) and output (figures) folders in Drive.
DATA_DIR = "/content/drive/MyDrive/datascience/ds_aditya_sawant/csv_files"
OUT_DIR  = "/content/drive/MyDrive/datascience/ds_aditya_sawant/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Reload the merged table from disk so this cell does not depend on kernel state.
merged = pd.read_csv(os.path.join(DATA_DIR, "merged_trades.csv"))

# Plotting frame: a new DataFrame with numeric sentiment / profit columns and
# two 0/1 indicator columns (profit > 0, and Side equal to 'buy' ignoring case).
profit_num = pd.to_numeric(merged['closed_pnl'], errors='coerce')
df = merged.assign(
    _sentiment_num=pd.to_numeric(merged['_sentiment_num'], errors='coerce'),
    profit=profit_num,
    profitable=(profit_num > 0).astype(int),
    is_buy=merged['Side'].astype(str).str.lower().eq('buy').astype(int),
)

sns.set(style='whitegrid')

# Display order of the sentiment buckets on every per-sentiment bar chart.
SENTIMENT_ORDER = ['Fear','Neutral','Greed']

def _save_sentiment_bar(column, title, filename, ylabel, unit_scale=False):
    """Plot the per-sentiment-group mean of ``df[column]`` and save it as a PNG.

    Parameters
    ----------
    column : str
        Column of ``df`` to average within each ``sentiment_group``.
    title : str
        Figure title.
    filename : str
        File name (inside ``OUT_DIR``) for the saved figure.
    ylabel : str
        Y-axis label, so the saved figure is readable on its own.
    unit_scale : bool, default False
        When True the y-axis is fixed to [0, 1] (for rates / shares).

    Returns
    -------
    pandas.Series
        The plotted means, indexed in ``SENTIMENT_ORDER`` (NaN for a bucket
        that has no rows).
    """
    plt.figure(figsize=(6,4))
    per_group = df.groupby('sentiment_group')[column].mean().reindex(SENTIMENT_ORDER)
    sns.barplot(x=per_group.index, y=per_group.values)
    if unit_scale:
        plt.ylim(0,1)
    plt.ylabel(ylabel)
    plt.title(title); plt.tight_layout()
    # Close the figure after saving so figures do not accumulate in memory.
    plt.savefig(os.path.join(OUT_DIR, filename)); plt.close()
    return per_group

# 1) Profitability rate by sentiment
prof_by = _save_sentiment_bar('profitable', "Profitability by Sentiment",
                              "profitability_by_sentiment.png",
                              ylabel="Share of profitable trades", unit_scale=True)

# 2) Average profit by sentiment
avg_profit = _save_sentiment_bar('profit', "Average Profit by Sentiment",
                                 "avg_profit_by_sentiment.png",
                                 ylabel="Mean closed PnL")

# 3) Buy ratio by sentiment
buy_ratio = _save_sentiment_bar('is_buy', "Buy Ratio by Sentiment",
                                "buy_ratio_by_sentiment.png",
                                ylabel="Share of buy trades", unit_scale=True)

# 4) Sentiment distribution
# Histogram (20 bins) of the numeric sentiment over all rows that have one.
sentiment_values = df['_sentiment_num'].dropna()
fig, ax = plt.subplots(figsize=(8,4))
sns.histplot(sentiment_values, bins=20, ax=ax)
ax.set_title("Sentiment distribution")
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR,"sentiment_distribution.png"))
plt.close(fig)

# 5) Top accounts by avg profit (min 5 trades)
# Per-account mean profit and number of trades with a non-missing profit.
acct = (
    df.groupby('Account')
      .agg(avg_profit=('profit','mean'), trades=('profit','count'))
      .reset_index()
)

# Keep accounts with at least 5 counted trades, then take the 15 best by mean profit.
has_enough_trades = acct['trades']>=5
top = acct.loc[has_enough_trades].sort_values('avg_profit', ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(data=top, x='avg_profit', y='Account', ax=ax)
ax.set_title("Top 15 Accounts by Avg Profit")
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR,"top15_accounts_avg_profit.png"))
plt.close(fig)

# Report the output folder and the files it now contains.
saved_files = os.listdir(OUT_DIR)
print("Plots saved to:", OUT_DIR)
print(saved_files)


Plots saved to: /content/drive/MyDrive/datascience/ds_aditya_sawant/outputs
['profitability_by_sentiment.png', 'avg_profit_by_sentiment.png', 'buy_ratio_by_sentiment.png', 'sentiment_distribution.png', 'top15_accounts_avg_profit.png']
