# Import Module

In [1]:
import pandas as pd
import numpy as np

from math import sqrt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier
from sklearn.metrics import mean_squared_error, f1_score
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load Data

In [16]:
data_1 = pd.read_csv("data//data_running_v1.csv")

In [17]:
data_2 = pd.read_csv("data//data_v5.csv")

## Data Prepare - sample_data

In [31]:
sample_data_1 = data_1[["title","created_utc","subreddit_subscribers","num_comments"]]
sample_data_2 = data_2[["title","created_utc","subreddit_subscribers","num_comments"]]

print(sample_data_1.head())
print(sample_data_2.head())

                                               title   created_utc  \
0   Official Q&amp;A for Saturday, November 04, 2023  1.699070e+09   
1  That Frigid Season is Here! - Annual cold / co...  1.698761e+09   
2  Super shoes have ‘blown distance running into ...  1.699103e+09   
3  What cool things do you with your medals? The ...  1.699108e+09   
4   What marathon training plan would you recommend?  1.699120e+09   

   subreddit_subscribers  num_comments  
0                2684021            90  
1                2684021           215  
2                2684021           120  
3                2684021            61  
4                2684021            12  
                                               title   created_utc  \
0   My New Favorite Running Sock! | Here's Jogology!  1.698928e+09   
1                  How to train to run 7 minute mile  1.698012e+09   
2  10 amazing healthy foods and herbs that will h...  1.697159e+09   
3           Yes, no, why - running smartwatch to buy 

In [39]:
sample_data = [sample_data_1, sample_data_2]
 
sample_data = pd.concat(sample_data)

### Convert UTC Time to Normal Datetime

In [40]:
sample_data['create_date'] = pd.to_datetime(sample_data['created_utc'],unit = 's')
sample_data.head()

Unnamed: 0,title,created_utc,subreddit_subscribers,num_comments,create_date
0,"Official Q&amp;A for Saturday, November 04, 2023",1699070000.0,2684021,90,2023-11-04 04:01:11
1,That Frigid Season is Here! - Annual cold / co...,1698761000.0,2684021,215,2023-10-31 14:11:22
2,Super shoes have ‘blown distance running into ...,1699103000.0,2684021,120,2023-11-04 13:05:18
3,What cool things do you with your medals? The ...,1699108000.0,2684021,61,2023-11-04 14:24:33
4,What marathon training plan would you recommend?,1699120000.0,2684021,12,2023-11-04 17:48:33


### Split Date, Time and drop unix_time 

In [41]:
sample_data['new_date'] = sample_data['create_date'].dt.date
sample_data['new_time'] = sample_data['create_date'].dt.time
del sample_data['created_utc']
sample_data.head()

Unnamed: 0,title,subreddit_subscribers,num_comments,create_date,new_date,new_time
0,"Official Q&amp;A for Saturday, November 04, 2023",2684021,90,2023-11-04 04:01:11,2023-11-04,04:01:11
1,That Frigid Season is Here! - Annual cold / co...,2684021,215,2023-10-31 14:11:22,2023-10-31,14:11:22
2,Super shoes have ‘blown distance running into ...,2684021,120,2023-11-04 13:05:18,2023-11-04,13:05:18
3,What cool things do you with your medals? The ...,2684021,61,2023-11-04 14:24:33,2023-11-04,14:24:33
4,What marathon training plan would you recommend?,2684021,12,2023-11-04 17:48:33,2023-11-04,17:48:33


In [42]:
sample_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97999 entries, 0 to 47999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   title                  97999 non-null  object        
 1   subreddit_subscribers  97999 non-null  int64         
 2   num_comments           97999 non-null  int64         
 3   create_date            97999 non-null  datetime64[ns]
 4   new_date               97999 non-null  object        
 5   new_time               97999 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 5.2+ MB


In [49]:
train_data = sample_data.drop_duplicates('title')

In [51]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1231 entries, 0 to 383
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   title                  1231 non-null   object        
 1   subreddit_subscribers  1231 non-null   int64         
 2   num_comments           1231 non-null   int64         
 3   create_date            1231 non-null   datetime64[ns]
 4   new_date               1231 non-null   object        
 5   new_time               1231 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 67.3+ KB


In [50]:
train_data.to_csv('train_data.csv', sep='|',encoding='utf-8')

# NLP Process