In [37]:
import numpy as np
import pandas as pd
from textblob import TextBlob

# Loading the dataset and viewing basic info
df = pd.read_csv('behaviour_simulation_train.csv', encoding='cp1252')
print(df.shape)
df.info()
df.isna().sum()

(300000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                300000 non-null  int64 
 1   date              300000 non-null  object
 2   likes             300000 non-null  int64 
 3   content           300000 non-null  object
 4   username          300000 non-null  object
 5   media             300000 non-null  object
 6   inferred company  300000 non-null  object
dtypes: int64(2), object(5)
memory usage: 16.0+ MB


id                  0
date                0
likes               0
content             0
username            0
media               0
inferred company    0
dtype: int64

In [38]:
# removing whitespace from content and converting to lowercase
df['content'] = df['content'].astype(str).str.strip().str.lower()

# creating timestamp related columns
df['datetime'] = pd.to_datetime(df['date'], errors='coerce')
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.day_name()

# creating columns for word count, character count, has_mention, has_hashtag
df['word_count'] = df['content'].apply(lambda x: len(x.split()))
df['char_count'] = df['content'].apply(len)

# create a column for sentiment polarity
df['sentiment'] = df['content'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [39]:
df['username'].nunique()

2449

In [40]:
df['media_type'] = df['media'].str.split('(').str[0].str[1:]

In [41]:
# Encoding features for training
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['company_encoded'] = le.fit_transform(df['inferred company'])
df['username_encoded'] = le.fit_transform(df['username'])
df['day_of_week_encoded'] = le.fit_transform(df['day_of_week'])
df['media_encoded'] = le.fit_transform(df['media_type'])

df['log_likes'] = df['likes'].apply(lambda x: np.log1p(x))


In [42]:
# Training the model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

X = df[['company_encoded', 'username_encoded', 'media_encoded', 'word_count', 'char_count', 'sentiment', 'hour', 'day_of_week_encoded']]
y = df['log_likes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

model = RandomForestRegressor()
model.fit(X_train, y_train)

preds = model.predict(X_test)
preds = np.expm1(preds)
y_test = np.expm1(y_test)

rmse = mean_squared_error(y_test, preds)**0.5
print("RMSE:", rmse)

RMSE: 3577.4343447494543


In [61]:
print(y_test[0])

1.0


In [57]:
for i in range(len(y_test)):
    print(y_test[i], preds[i])

1.0 74.23354793235377


KeyError: 1