In [9]:
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DNK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the four input CSV files (paths are relative to the working directory).
# NOTE(review): the mixed-case name 'counT_actions.csv' is kept exactly as written;
# on a case-sensitive file system it has to match the file on disk.
user_data, post_text_df, feed_data, count_actions = (
    pd.read_csv(csv_name)
    for csv_name in (
        'user_data.csv',
        'post_text_df.csv',
        'feed_data.csv',
        'counT_actions.csv',
    )
)

In [3]:
# Show the (rows, columns) shape of every loaded table, one per line, in load order.
print(
    *(table.shape for table in (user_data, post_text_df, feed_data, count_actions)),
    sep='\n',
)

(163205, 8)
(7023, 3)
(1000000, 5)
(163205, 2)


In [4]:
# Rename the 'count' column to 'count_actions' so it stays self-describing
# once it is merged into user_data.
# rename() silently ignores a label that is no longer present, so re-running
# this cell is harmless (the copy-then-drop form raised KeyError on a second run).
count_actions = count_actions.rename(columns={'count': 'count_actions'})
count_actions

Unnamed: 0,user_id,count_actions
0,200,401
1,201,748
2,202,724
3,203,382
4,204,161
...,...,...
163200,168548,382
163201,168549,274
163202,168550,407
163203,168551,525


In [5]:
# Attach count_actions to every user row; the left join keeps all rows of
# user_data even when no matching user_id exists in count_actions.
user_data = user_data.merge(count_actions, how='left', on='user_id')
user_data

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401
1,201,0,37,Russia,Abakan,0,Android,ads,748
2,202,1,17,Russia,Smolensk,4,Android,ads,724
3,203,0,18,Russia,Moscow,1,iOS,ads,382
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161
...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382
163201,168549,0,18,Russia,Tula,2,Android,organic,274
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525


In [20]:
# Shared WordNet lemmatizer instance; preprocessing() uses it as its default `token`.
wnl = WordNetLemmatizer()

def preprocessing(line, token=None):
    """Normalise one document before it is tokenised for TF-IDF.

    Steps: lower-case the text, replace every ASCII punctuation character
    with a space, split on whitespace and lemmatize each word.

    Args:
        line: Raw document text (str).
        token: Object exposing ``lemmatize(word) -> str``. When omitted, the
            module-level ``wnl`` lemmatizer is used.

    Returns:
        The lemmatized words joined by single spaces ("" for empty input).
    """
    if token is None:
        # Resolved at call time so the function picks up the current `wnl`.
        token = wnl
    line = line.lower()
    # re.escape is required: string.punctuation contains `\` immediately
    # followed by `]`; unescaped, the pair is parsed as an escaped `]` and
    # backslashes are never removed from the text.
    line = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", line)
    # str.split() with no argument splits on any whitespace run, newlines
    # included, so no separate newline replacement is needed.
    return ' '.join(token.lemmatize(x) for x in line.split())

def tfidf_func(df: pd.DataFrame, column: str):
    """Build a dense TF-IDF feature table from one text column.

    The vectorizer runs `preprocessing` on every document, removes English
    stop words, ignores terms found in fewer than 5 documents or in more than
    85% of them, and keeps at most 10000 terms.

    Args:
        df: Frame that holds the text column.
        column: Name of the column in `df` to vectorise.

    Returns:
        A DataFrame with one row per document and one ``tfidf_<term>`` column
        per retained term. It is built from a dense array, so it carries a
        fresh default index rather than the index of `df`.
    """
    vectorizer = TfidfVectorizer(
        preprocessor=preprocessing,
        stop_words='english',
        max_features=10000,
        min_df=5,
        max_df=0.85,
    )
    weights = vectorizer.fit_transform(df[column])

    feature_names = [f"tfidf_{term}" for term in vectorizer.get_feature_names_out()]

    return pd.DataFrame(weights.toarray(), columns=feature_names)

In [19]:
# Vectorise the 'text' column of post_text_df into TF-IDF features.
tfidf_df = tfidf_func(post_text_df, 'text')

# Preview the first five rows only.
tfidf_df.head()

Unnamed: 0,tfidf_00,tfidf_000,tfidf_000m,tfidf_007,tfidf_01,tfidf_03,tfidf_04,tfidf_05,tfidf_06,tfidf_07,...,tfidf_zeppelin,tfidf_zero,tfidf_zeta,tfidf_zhang,tfidf_zimbabwe,tfidf_zip,tfidf_zombie,tfidf_zone,tfidf_zoom,tfidf_zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.144911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.055949,0.0,0.0,0.0,0.058706,0.0,0.0,0.0,0.05528,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056265
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def get_PCA(n_components: int, data: pd.DataFrame, random_state=42):
    """Project `data` onto its first `n_components` principal components.

    Args:
        n_components: Number of principal components to keep.
        data: Numeric feature table, one row per sample.
        random_state: Seed forwarded to PCA. It only matters when a
            randomized/iterative SVD solver is selected; fixing it makes the
            projection reproducible between runs. Pass None for unseeded
            behaviour.

    Returns:
        Array of shape (n_samples, n_components) with the projected samples.
    """
    # PCA centres the input itself before the decomposition, so no manual
    # `data - data.mean()` is needed; skipping it avoids a full extra copy of
    # the dense feature table.
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(data)

In [22]:
# Reduce the TF-IDF table to 50 principal components.
pca_matrix = get_PCA(50, tfidf_df)

# One name per projected column: feature_1, feature_2, ...
col_names = [f'feature_{idx}' for idx in range(1, pca_matrix.shape[1] + 1)]

pca_df = pd.DataFrame(data=pca_matrix, columns=col_names)

pca_df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
0,0.005147,0.194684,0.026514,-0.073551,-0.149517,-0.025243,0.048232,-0.171348,-0.132980,-0.096592,...,0.002770,0.015150,-0.037546,0.014755,-0.030483,0.013588,0.020241,-0.023167,0.032114,0.021335
1,-0.000803,0.218085,0.067561,0.077333,-0.054572,-0.002843,0.005914,-0.026982,-0.014664,0.013805,...,0.007674,-0.082471,0.014627,0.017087,0.022628,-0.034949,-0.038008,0.026565,0.010053,-0.009858
2,-0.005729,0.163478,0.016924,-0.098530,-0.153900,-0.024024,0.039049,-0.133561,-0.084596,-0.096298,...,-0.038628,0.027905,-0.016978,0.010790,0.050866,0.010315,-0.017937,-0.017863,-0.000048,0.023703
3,0.010938,0.168339,0.025062,-0.063091,-0.153456,-0.016477,0.054518,-0.080653,-0.020960,-0.058200,...,-0.023014,0.098884,0.018685,-0.017327,-0.031135,-0.045588,-0.033993,-0.009839,0.036338,0.011815
4,0.000350,0.122627,0.010034,-0.040646,-0.059208,-0.006179,-0.003250,-0.012157,0.010298,0.008271,...,0.010623,0.015857,0.023957,0.025522,-0.014381,0.012647,0.000772,-0.014178,0.012667,-0.005298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-0.164127,-0.151603,0.168022,-0.013667,0.019660,-0.006831,0.012408,0.029409,-0.024325,-0.037183,...,-0.024458,0.042370,-0.075578,-0.026011,0.026100,-0.024041,0.041855,-0.008036,0.016779,-0.000262
7019,-0.136358,-0.129865,0.106072,-0.004081,-0.055466,-0.004484,-0.006954,-0.005667,-0.015173,0.025596,...,-0.027759,0.015210,-0.017865,0.009113,-0.002165,0.023063,-0.011472,0.035578,0.029828,0.028756
7020,-0.102380,-0.070634,-0.168157,0.079847,-0.034367,-0.001904,-0.012952,0.018657,-0.002255,-0.046363,...,-0.061015,0.022039,0.042510,-0.002182,-0.045331,-0.026317,0.020485,-0.027644,0.008355,0.059747
7021,-0.112783,-0.035907,-0.101419,0.058777,0.020218,-0.005854,0.033726,0.004224,-0.036691,-0.005622,...,-0.024241,-0.014465,-0.029016,0.009381,-0.003801,-0.025043,0.042122,0.061133,-0.022629,-0.024155


In [23]:
# Replace the raw 'text' column with its PCA features. concat(axis=1) pairs
# rows by index label, so both frames must share the same index.
new_post_text_df = pd.concat([post_text_df.drop(columns='text'), pca_df], axis=1)

# Persist the post features; index=False leaves the row index out of the file.
new_post_text_df.to_csv('new_post_text_df.csv', index=False)

# Running list of categorical column names; it starts with the post topic.
category_features = ['topic']


FEED_DATA PREPROCESSING. DATETIME

In [24]:
# Parse 'timestamp' into datetime values and discard the 'action' column.
feed_data = (
    feed_data
    .assign(timestamp=pd.to_datetime(feed_data['timestamp']))
    .drop(columns='action')
)

In [25]:
# Hour-of-day bands, left-closed because right=False:
# [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening.
bins = [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']

# Derive all calendar features from 'timestamp' in one pass.
feed_data = feed_data.assign(
    month=feed_data['timestamp'].dt.month,
    day=feed_data['timestamp'].dt.day,
    second=feed_data['timestamp'].dt.second,
    weekday=feed_data['timestamp'].dt.weekday,
    # .dt.weekday counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    is_weekend=feed_data['timestamp'].dt.weekday.isin([5, 6]).astype(int),
    part_of_day=pd.cut(feed_data['timestamp'].dt.hour, bins=bins, labels=labels, right=False),
)

In [26]:
# Alias only: new_feed_data and feed_data name the same DataFrame object (no copy is made).
new_feed_data = feed_data

In [39]:
# Write the feed table to CSV; index=False leaves the row index out of the file.
new_feed_data.to_csv('new_feed_data.csv', index=False)

In [27]:
# Register the calendar-derived columns as categorical features.
# Names already in the list are skipped, so re-running this cell does not
# append duplicates.
category_features.extend(
    [name for name in ['month', 'day', 'weekday', 'part_of_day']
     if name not in category_features]
)

print(category_features)

['topic', 'month', 'day', 'weekday', 'part_of_day']


USER_DATA PREPROCESSING

In [35]:
# Age bands, left-closed because right=False:
# [0, 25) young, [25, 50) adult, [50, inf) old.
bins = [0, 25, 50, float('inf')]
labels = ['young', 'adult', 'old']
user_data = user_data.assign(
    category_of_age=pd.cut(user_data['age'], bins=bins, labels=labels, right=False)
)

In [36]:
# Alias only: new_user_data and user_data name the same DataFrame object (no copy is made).
new_user_data = user_data

In [37]:
# Write the user table to CSV; index=False leaves the row index out of the file.
new_user_data.to_csv('new_user_data.csv', index=False)

In [38]:
# Register every user_data column except 'age', 'gender' and 'count_actions'
# as a categorical feature. Names already in the list are skipped, so
# re-running this cell does not append duplicates.
# NOTE(review): this also registers 'user_id' as categorical — confirm that is intended.
category_features.extend(
    [name for name in user_data.columns
     if name not in ['age', 'gender', 'count_actions']
     and name not in category_features]
)

print(category_features)

['topic', 'month', 'day', 'weekday', 'part_of_day', 'user_id', 'country', 'city', 'exp_group', 'os', 'source', 'category_of_age']
