In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
import pandas as pd
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root_dir, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/embeddings/embeddings.csv


In [2]:
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Return a ``(tokenizer, model)`` pair for one of the supported encoders.

    Args:
        model_name: one of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A tuple of the tokenizer loaded with ``AutoTokenizer.from_pretrained``
        and the model loaded with the matching class's ``from_pretrained``,
        both using the same checkpoint name.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']

    # Pick the checkpoint name and the model class that belong together.
    if model_name == 'bert':
        checkpoint, model_cls = 'bert-base-cased', BertModel  # https://huggingface.co/bert-base-cased
    elif model_name == 'roberta':
        checkpoint, model_cls = 'roberta-base', RobertaModel  # https://huggingface.co/roberta-base
    else:
        checkpoint, model_cls = 'distilbert-base-cased', DistilBertModel  # https://huggingface.co/distilbert-base-cased

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

In [3]:
pip install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.5
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import os
import time
import pickle
from catboost import CatBoostClassifier
import pandas as pd
from sqlalchemy import create_engine

def get_model_path(path: str) -> str:
    """Resolve the location of the saved model.

    Args:
        path: path to use when the code is not running inside the LMS.

    Returns:
        ``'/workdir/user_input/model'`` when the ``IS_LMS`` environment
        variable equals ``"1"``; otherwise ``path`` unchanged.
    """
    # IS_LMS == "1" tells us the code runs in the LMS rather than locally.
    running_in_lms = os.environ.get("IS_LMS") == "1"
    return '/workdir/user_input/model' if running_in_lms else path

def load_models():
    """Load the CatBoost classifier stored at the resolved model path.

    The path is obtained from ``get_model_path("catboost_model")``.

    Returns:
        Whatever ``CatBoostClassifier.load_model`` returns for that path
        (presumably the loaded classifier itself; verify against CatBoost docs).
    """
    model_file = get_model_path("catboost_model")
    classifier = CatBoostClassifier()
    return classifier.load_model(model_file)

def batch_load_sql(query: str) -> pd.DataFrame:
    """Run ``query`` against the PostgreSQL database and return every row.

    The result is read in chunks of ``CHUNKSIZE`` rows over a connection
    opened with ``stream_results=True`` and the chunks are concatenated into
    one frame with a fresh ``0..n-1`` index.

    Args:
        query: SQL text passed to ``pd.read_sql`` as is.

    Returns:
        A single DataFrame with all rows; an empty DataFrame when the reader
        yields no chunks at all.
    """
    CHUNKSIZE = 200000
    # NOTE(review): credentials are hard-coded in this URL; they should come
    # from an environment variable or a secrets store.
    engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )
    conn = engine.connect().execution_options(stream_results=True)
    try:
        chunks = list(pd.read_sql(query, conn, chunksize=CHUNKSIZE))
    finally:
        # Release the connection and the engine's pool even when reading fails.
        conn.close()
        engine.dispose()

    if not chunks:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)


def _sample_balanced_pairs(df: pd.DataFrame, limit: int) -> pd.DataFrame:
    """Sample ``limit`` positive and ``limit`` negative rows for each eligible user.

    A user is eligible when they have at least ``limit`` rows with
    ``target == 1`` and at least ``limit`` rows with ``target == 0``.
    Returns an empty frame with ``df``'s columns when nobody is eligible.
    """
    # Group once instead of re-filtering the whole frame for every user.
    by_user = df.groupby('user_id', sort=False)
    sampled = []
    for user_id in set(df['user_id']):
        user_df = by_user.get_group(user_id)
        positives = user_df[user_df['target'] == 1]
        negatives = user_df[user_df['target'] == 0]
        if (positives.shape[0] >= limit) and (negatives.shape[0] >= limit):
            sampled.append(positives.sample(n=limit, random_state=42))
            sampled.append(negatives.sample(n=limit, random_state=42))
    if not sampled:
        # pd.concat raises ValueError on an empty list.
        return df.iloc[0:0]
    return pd.concat(sampled, ignore_index=True)


def batch_load_sql_feed_data(query: str, post_text, user_data, limit) -> pd.DataFrame:
    """Stream feed rows from the database and keep a balanced sample per user.

    For every chunk of ``CHUNKSIZE`` rows the function keeps only rows whose
    ``post_id`` is in ``post_text`` and whose ``user_id`` is in ``user_data``,
    removes duplicates, fills missing values with 0, and then, for each user
    with at least ``limit`` rows of each target class in that chunk, samples
    ``limit`` rows with ``target == 1`` and ``limit`` with ``target == 0``
    (``random_state=42``). The ``action`` column is dropped from the result.
    A progress line is printed after each chunk.

    Args:
        query: SQL text; the result must contain the columns ``target``,
            ``post_id``, ``user_id`` and ``action``. A ``LIMIT <n>`` clause,
            when present, is used only to print a progress percentage.
        post_text: frame with a ``post_id`` column.
        user_data: frame with a ``user_id`` column.
        limit: number of rows sampled per user and per target class.

    Returns:
        A DataFrame with columns ``post_id``, ``target``, ``user_id``; it is
        empty when the query yields no chunks.
    """
    import re  # only needed here, to read the LIMIT value for progress output

    CHUNKSIZE = 10000000
    # NOTE(review): credentials are hard-coded in this URL; they should come
    # from an environment variable or a secrets store.
    engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )

    # The LIMIT value only drives the progress percentage, so a query
    # without one must not crash the load.
    limit_match = re.search(r"LIMIT\s+(\d+)", query, flags=re.IGNORECASE)
    final_amount = int(limit_match.group(1)) if limit_match else None

    known_posts = post_text['post_id'].to_frame()
    known_users = user_data['user_id'].to_frame()

    chunks = []
    iteration = 0
    conn = engine.connect().execution_options(stream_results=True)
    try:
        start_time = time.time()
        for chunk_dataframe in pd.read_sql(query, conn, chunksize=CHUNKSIZE):
            df = (
                known_posts
                .merge(chunk_dataframe[['target', 'post_id', 'user_id', 'action']], how='inner', on='post_id')
                .merge(known_users, how='inner', on='user_id')
                .drop_duplicates()
                .fillna(0)
            )
            #df['target'] = ((df['action'] == 'like') | (df['target'] == 1)).astype(int)

            chunks.append(_sample_balanced_pairs(df, limit).drop('action', axis=1))
            iteration += 1

            total_rows = sum([df_size.shape[0] for df_size in chunks])
            elapsed = time.time() - start_time
            if final_amount:
                print(f'{iteration * CHUNKSIZE * 100 / final_amount}% progress, size = {total_rows}, iteration time = {elapsed} seconds')
            else:
                print(f'chunk {iteration}, size = {total_rows}, iteration time = {elapsed} seconds')
            start_time = time.time()
    finally:
        # Release the connection and the engine's pool even when a chunk fails.
        conn.close()
        engine.dispose()

    if not chunks:
        return pd.DataFrame(columns=['post_id', 'target', 'user_id'])
    return pd.concat(chunks, ignore_index=True)

In [5]:
# Tunable limits used by the cells below.
# Value interpolated into the LIMIT clause of the feed_data query.
feed_data_limit = 100000000
# Value interpolated into the LIMIT clause of the user_data query.
user_data_limit = 163205#114903 163205
# Value interpolated into the LIMIT clause of the post_text_df query.
post_text_limit = 7023#6924 7023
# Number of PCA components kept for the post embeddings.
embeddings_pca_components = 80
# Rows sampled per user and per target class when balancing the feed data.
group_limit = 20

In [6]:
%%time
# Load the user table and keep the first row of every distinct feature
# combination; user_id is ignored when comparing rows.
user_data = batch_load_sql(f"SELECT * FROM public.user_data LIMIT {user_data_limit}")
user_data = user_data.drop_duplicates(subset=user_data.columns.drop('user_id').tolist())

# Same for posts (post_id ignored when comparing); rows with any missing
# value are dropped afterwards.
post_text = batch_load_sql(f"SELECT * FROM public.post_text_df LIMIT {post_text_limit}")
post_text = post_text.drop_duplicates(subset=post_text.columns.drop('post_id').tolist()).dropna()

CPU times: user 1.05 s, sys: 120 ms, total: 1.17 s
Wall time: 11.3 s


In [7]:
%%time
#SAVE_DF_TO_SQL

import pandas as pd
from sqlalchemy import create_engine

# Reduce the post embeddings with PCA and write them to a database table.
# NOTE(review): credentials are hard-coded in this URL; the user name ends in
# "-ro", presumably read-only — verify that the to_sql write below can succeed.
engine = create_engine(
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
    "postgres.lab.karpov.courses:6432/startml"
)

# Read the raw embedding matrix and centre every column on its mean.
embeddings = np.genfromtxt('../input/embeddings/embeddings.csv', delimiter=',')
embeddings = pd.DataFrame(embeddings).subtract(pd.DataFrame(embeddings).mean())


# Keep embeddings_pca_components principal components.
embeddings = PCA(n_components=embeddings_pca_components).fit_transform(embeddings)

embeddings = pd.DataFrame(embeddings)
# Reuse post_text's index; this requires the same number of rows in both.
# Assumes row i of the CSV corresponds to row i of post_text — TODO confirm.
embeddings.index = post_text.index
embeddings['emb_id'] = embeddings.index

#pd.concat([user_data, post_text, embeddings])
# Replaces the table if it already exists; the frame index is not written.
embeddings.to_sql('denis21.97@mail.ru_lesson_22', con=engine, if_exists="replace", index=False, method='multi')

#SAVE_DF_TO_SQL

CPU times: user 25.8 s, sys: 1.1 s, total: 26.9 s
Wall time: 37.5 s


In [8]:
# Replace each column with its integer category code, standardised by a
# scaler fitted on that column's codes.
for col in ['city', 'country', 'age']:
    codes = user_data[col].astype('category').cat.codes
    code_frame = codes.to_frame()
    scaler = StandardScaler().fit(code_frame)
    user_data[col] = scaler.transform(code_frame)

In [9]:
user_data

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,0.539111,0.067687,-1.243800,3,Android,ads
1,201,0,0.816628,0.067687,-1.839694,0,Android,ads
2,202,1,-1.033485,0.067687,0.849631,4,Android,ads
3,203,0,-0.940979,0.067687,-0.048341,1,iOS,ads
4,204,0,0.724123,0.067687,-1.726759,3,Android,ads
...,...,...,...,...,...,...,...,...
163197,168545,1,-0.293440,0.067687,-1.529352,4,iOS,organic
163198,168546,1,0.631617,0.067687,-1.201564,4,Android,organic
163200,168548,0,0.724123,0.067687,-0.809505,4,Android,organic
163202,168550,1,1.186651,0.067687,1.553869,4,Android,organic


In [10]:
# One-hot encode the listed user columns (first level of each dropped),
# attach the indicator columns, then remove the source columns.
one_hot_columns = ['gender', 'os', 'exp_group', 'source']
dummies = pd.get_dummies(user_data[one_hot_columns].astype(object), drop_first=True)
for dummy_name in dummies.columns:
    user_data[dummy_name] = dummies[dummy_name]
user_data.drop(columns=one_hot_columns, inplace=True)

In [11]:
user_data

Unnamed: 0,user_id,age,country,city,gender_1,os_iOS,exp_group_1,exp_group_2,exp_group_3,exp_group_4,source_organic
0,200,0.539111,0.067687,-1.243800,1,0,0,0,1,0,0
1,201,0.816628,0.067687,-1.839694,0,0,0,0,0,0,0
2,202,-1.033485,0.067687,0.849631,1,0,0,0,0,1,0
3,203,-0.940979,0.067687,-0.048341,0,1,1,0,0,0,0
4,204,0.724123,0.067687,-1.726759,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
163197,168545,-0.293440,0.067687,-1.529352,1,1,0,0,0,1,1
163198,168546,0.631617,0.067687,-1.201564,1,0,0,0,0,1,1
163200,168548,0.724123,0.067687,-0.809505,0,0,0,0,0,1,1
163202,168550,1.186651,0.067687,1.553869,1,0,0,0,0,1,1


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# Cluster users on every column except user_id and store the predicted
# cluster label in a new 'cluster' column.
cluster_features = user_data.drop('user_id', axis=1)
kmeans = KMeans(n_clusters=5, n_init=100, random_state=0).fit(cluster_features)
user_data['cluster'] = kmeans.predict(cluster_features)

In [13]:
# One-hot encode the cluster label (first level dropped), attach the
# indicator columns, then remove the source column.
one_hot_columns = ['cluster']
dummies = pd.get_dummies(user_data[one_hot_columns].astype(object), drop_first=True)
for dummy_name in dummies.columns:
    user_data[dummy_name] = dummies[dummy_name]
user_data.drop(columns=one_hot_columns, inplace=True)

from category_encoders import BinaryEncoder

#binary_encoder = BinaryEncoder(cols=['age', 'city', 'country'])
#new_data = binary_encoder.fit_transform(user_data[['age', 'city', 'country']])
#user_data = pd.concat([user_data, new_data], axis = 1)

#user_data = user_data.drop(['gender', 'exp_group', 'os', 'source', 'country', 'city', 'age'], axis=1)
#user_data

from sklearn.decomposition import PCA
# Project the mean-centred user features (everything except user_id) onto 20
# principal components and rebuild user_data as user_id plus those components.
# NOTE(review): the user_data frame displayed in the next cell still shows the
# original 14 feature columns, so this code was presumably not executed;
# n_components=20 also exceeds that column count — confirm before relying on it.
pca = PCA(n_components=20)
pca_array = pca.fit_transform(user_data.drop('user_id', axis=1).subtract(user_data.drop('user_id', axis=1).mean()))

# Reset user_data to the 0..n-1 index of the PCA frame so the concat below
# lines up row by row.
user_data.index = pd.DataFrame.from_records(pca_array).index
user_data = pd.concat([user_data['user_id'].copy(), pd.DataFrame.from_records(pca_array)], axis=1)

In [14]:
user_data

Unnamed: 0,user_id,age,country,city,gender_1,os_iOS,exp_group_1,exp_group_2,exp_group_3,exp_group_4,source_organic,cluster_1,cluster_2,cluster_3,cluster_4
0,200,0.539111,0.067687,-1.243800,1,0,0,0,1,0,0,0,1,0,0
1,201,0.816628,0.067687,-1.839694,0,0,0,0,0,0,0,0,1,0,0
2,202,-1.033485,0.067687,0.849631,1,0,0,0,0,1,0,0,0,1,0
3,203,-0.940979,0.067687,-0.048341,0,1,1,0,0,0,0,0,0,1,0
4,204,0.724123,0.067687,-1.726759,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163197,168545,-0.293440,0.067687,-1.529352,1,1,0,0,0,1,1,0,1,0,0
163198,168546,0.631617,0.067687,-1.201564,1,0,0,0,0,1,1,0,1,0,0
163200,168548,0.724123,0.067687,-0.809505,0,0,0,0,0,1,1,0,1,0,0
163202,168550,1.186651,0.067687,1.553869,1,0,0,0,0,1,1,1,0,0,0


In [15]:
post_text

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [16]:
!pip install category_encoders

[0m

In [17]:
#Embedding encoding
from sklearn.decomposition import PCA

# Read the raw embedding matrix and centre every column on its mean.
embeddings = np.genfromtxt('../input/embeddings/embeddings.csv', delimiter=',')
embeddings = pd.DataFrame(embeddings).subtract(pd.DataFrame(embeddings).mean())


# Keep embeddings_pca_components principal components.
embeddings = PCA(n_components=embeddings_pca_components).fit_transform(embeddings)

embeddings = pd.DataFrame(embeddings)
# Reuse post_text's index; this requires the same number of rows in both.
# Assumes row i of the CSV corresponds to row i of post_text — TODO confirm.
embeddings.index = post_text.index
# Append the component columns (named 0..n-1) to the post table.
post_text = pd.concat([post_text, embeddings], axis=1)

In [18]:
embeddings.isnull().values.any(), post_text.isnull().values.any()

(False, False)

In [19]:
# One-hot encode the post topic (first level dropped), attach the indicator
# columns, then remove the source column.
one_hot_columns = ['topic']
dummies = pd.get_dummies(post_text[one_hot_columns].astype(object), drop_first=True)
for dummy_name in dummies.columns:
    post_text[dummy_name] = dummies[dummy_name]
post_text.drop(columns=one_hot_columns, inplace=True)

from category_encoders import PolynomialEncoder

# Alternative topic encoding with PolynomialEncoder, followed by a correlation
# heatmap of the post features.
# NOTE(review): this code reads post_text['topic'], but the one-hot cell above
# drops 'topic' in place; run after it, these lines would raise KeyError.
# Presumably a leftover experiment — confirm whether it should be removed.
encoder = PolynomialEncoder(drop_invariant=True)
encoded_topics = encoder.fit_transform(post_text['topic'].unique())
#pd.DataFrame(index=[post_text['topic'].unique()])
encoder.get_feature_names()
# Attach the topic value to each encoded row so it can serve as the merge key.
encoded_topics['topic'] = post_text['topic'].unique()


post_text = pd.merge(post_text, encoded_topics, how='inner', on='topic')

plt.figure(figsize=(18, 10))

# Correlation matrix of all columns except the identifier, topic and raw text.
sb.heatmap(post_text.drop(['post_id', 'topic', 'text'], axis=1).corr(), cmap="YlGnBu", annot=True)

In [20]:
post_text

Unnamed: 0,post_id,text,0,1,2,3,4,5,6,7,...,76,77,78,79,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech
0,1,UK economy facing major risks\n\nThe UK manufa...,-0.201362,-2.497871,-4.421739,3.475615,4.448384,2.778310,0.343653,1.853580,...,0.321943,0.761889,0.137713,0.090725,0,0,0,0,0,0
1,2,Aids and climate top Davos agenda\n\nClimate c...,2.087656,-5.046058,-2.603661,-0.021889,1.696300,-1.406209,-0.451960,-0.421231,...,0.190539,0.451460,-0.303735,0.460018,0,0,0,0,0,0
2,3,Asian quake hits European shares\n\nShares in ...,-1.422821,1.007635,-4.857323,3.233500,1.447078,-0.640593,0.534364,1.613082,...,-0.166913,-0.025791,-0.046254,0.069063,0,0,0,0,0,0
3,4,India power shares jump on debut\n\nShares in ...,0.966748,-5.594855,-2.521049,5.650093,2.028688,0.605763,-0.158896,-0.621623,...,-0.238404,-0.405952,-0.619940,0.271458,0,0,0,0,0,0
4,5,Lacroix label bought by US firm\n\nLuxury good...,2.305621,-1.848822,-1.335231,0.336507,0.248042,0.646015,-2.126502,-1.623979,...,-0.264298,1.108525,-0.519656,0.191945,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",-3.700599,0.208601,1.302693,-1.823509,-2.265575,1.178967,-3.682232,-1.065777,...,0.366297,-0.239300,-0.738744,-0.251350,0,0,1,0,0,0
7019,7316,I give this movie 2 stars purely because of it...,-3.655442,2.609598,-1.053678,-0.537180,1.246383,-0.043620,-0.844619,-2.691123,...,0.424610,0.009402,0.266917,-0.002310,0,0,1,0,0,0
7020,7317,I cant believe this film was allowed to be mad...,-1.795682,4.181897,-2.098721,-1.326406,-1.308386,-1.960385,-0.451147,0.009920,...,-0.134895,0.147603,-0.102050,0.015894,0,0,1,0,0,0
7021,7318,The version I saw of this film was the Blockbu...,-4.688106,-0.813006,4.359813,1.438183,-0.027269,0.352093,1.454370,0.147835,...,-0.246286,0.242985,-0.021946,-0.060910,0,0,1,0,0,0


In [21]:
%%time

# Stream feed_data (row cap feed_data_limit) and keep group_limit rows per
# target class for each user; see batch_load_sql_feed_data.
df = batch_load_sql_feed_data(f"SELECT * FROM public.feed_data LIMIT {feed_data_limit}", post_text, user_data, group_limit)

10.0% progress, size = 60780, iteration time = 251.5331997871399 seconds
20.0% progress, size = 122384, iteration time = 254.39681720733643 seconds
30.0% progress, size = 182192, iteration time = 236.85126733779907 seconds
40.0% progress, size = 243848, iteration time = 255.32499504089355 seconds
50.0% progress, size = 301044, iteration time = 232.50948119163513 seconds
60.0% progress, size = 362236, iteration time = 247.6327588558197 seconds
70.0% progress, size = 420388, iteration time = 246.52428221702576 seconds
80.0% progress, size = 459748, iteration time = 135.1030569076538 seconds
CPU times: user 31min 56s, sys: 47 s, total: 32min 43s
Wall time: 31min 6s


In [22]:
df

Unnamed: 0,post_id,target,user_id
0,5796,1,13064
1,6326,1,13064
2,5131,0,13064
3,1105,0,13064
4,2562,1,13065
...,...,...,...
459743,3600,0,78154
459744,1552,1,98302
459745,6500,1,98302
459746,2837,0,98302


In [23]:
# Re-balance the combined frame: for every user with at least `limit` rows of
# each target class, keep `limit` sampled rows with target == 1 and `limit`
# with target == 0 (random_state=42).
limit = group_limit
unique_pairs = []
# Group once instead of re-filtering the whole frame for every user.
rows_by_user = df.groupby('user_id', sort=False)
for uid in set(df['user_id']):
    user_df = rows_by_user.get_group(uid)
    positives = user_df[user_df['target'] == 1]
    negatives = user_df[user_df['target'] == 0]
    if (positives.shape[0] >= limit) and (negatives.shape[0] >= limit):
        unique_pairs.append(positives.sample(n=limit, random_state=42))
        unique_pairs.append(negatives.sample(n=limit, random_state=42))
# pd.concat raises ValueError on an empty list; fall back to an empty frame
# with the same columns when no user qualifies.
data_to_learn = pd.concat(unique_pairs, ignore_index=True) if unique_pairs else df.iloc[0:0]
df = data_to_learn

In [24]:
df.shape

(459500, 3)

In [25]:
# Count rows per user and show the users that have more than 10 rows.
# NOTE(review): the recorded output is empty although group_limit is 20;
# presumably it was produced with a different limit — confirm.
count_pairs = df.groupby(['user_id']).count()
count_pairs[count_pairs['target'] > 10]

Unnamed: 0_level_0,post_id,target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [26]:
# Attach post features (via post_id) and user features (via user_id) to each
# sampled row; inner joins keep only rows that match in both tables.
df = pd.merge(pd.merge(post_text, df, how='inner', on='post_id'), user_data, how='inner', on='user_id')

In [27]:
#df = df.drop(['timestamp', 'action'], axis=1).drop_duplicates()

# Earlier variant of the merge-and-balance step.
# NOTE(review): `feed_data` is not defined anywhere in the visible notebook, so
# this code would raise NameError on a fresh run; presumably a leftover —
# confirm whether it should be removed.
df = post_text.merge(feed_data, how='inner', on='post_id').merge(user_data, how='inner', on='user_id').drop_duplicates().fillna(0)

# For each user with at least `limit` rows of each target class, keep the
# first `limit` rows with target == 1 and the first `limit` with target == 0.
limit = 5
chunks = []
for id in set(df['user_id']):
    user_df = df[df['user_id'] == id]
    if (user_df[user_df['target'] == 1].shape[0] >= limit) and (user_df[user_df['target'] == 0].shape[0] >= limit):
        chunks.append(user_df[user_df['target'] == 1].iloc[:limit])
        chunks.append(user_df[user_df['target'] == 0].iloc[:limit])
data_to_learn = pd.concat(chunks, ignore_index=True)
df = data_to_learn

# At this point df is the same object as data_to_learn, so this merges the
# frame with itself.
data_to_learn_merged = pd.merge(data_to_learn, df, on=['user_id', 'post_id', 'target'], how="inner")

# Fill remaining missing values in each column with that column's most
# frequent value.
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [28]:
df

Unnamed: 0,post_id,text,0,1,2,3,4,5,6,7,...,os_iOS,exp_group_1,exp_group_2,exp_group_3,exp_group_4,source_organic,cluster_1,cluster_2,cluster_3,cluster_4
0,1,UK economy facing major risks\n\nThe UK manufa...,-0.201362,-2.497871,-4.421739,3.475615,4.448384,2.778310,0.343653,1.853580,...,1,0,1,0,0,0,0,0,0,1
1,528,WMC profits up amid bid criticism\n\nAustralia...,-0.073658,-2.427082,-2.868966,4.219771,1.794207,0.080879,0.841885,-1.586332,...,1,0,1,0,0,0,0,0,0,1
2,1686,Hansen delays return until 2006\n\nBritish tri...,-1.906735,-2.436222,-3.835560,-1.161802,-0.016847,3.784282,-0.093581,1.453832,...,1,0,1,0,0,0,0,0,0,1
3,2821,Youre probably being sarcastic but can we plea...,4.751424,6.113358,0.885324,-1.354131,1.480178,-0.811218,0.147476,-0.570175,...,1,0,1,0,0,0,0,0,0,1
4,1,UK economy facing major risks\n\nThe UK manufa...,-0.201362,-2.497871,-4.421739,3.475615,4.448384,2.778310,0.343653,1.853580,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459495,7260,There are many different versions of this one ...,0.917119,2.050893,2.036139,-0.282366,1.406197,-2.279760,-3.106411,-2.154586,...,1,0,0,0,0,0,1,0,0,0
459496,6886,This movie is... horrible and wonderful at the...,-3.857872,0.185496,1.476427,0.483496,-0.701518,-1.114603,-0.429486,1.606428,...,0,0,1,0,0,0,1,0,0,0
459497,7006,This movie is a terrible waste of time. Althou...,-6.120048,2.019082,-1.457876,3.215819,1.110198,0.841159,-1.421299,2.024819,...,0,0,1,0,0,0,1,0,0,0
459498,7006,This movie is a terrible waste of time. Althou...,-6.120048,2.019082,-1.457876,3.215819,1.110198,0.841159,-1.421299,2.024819,...,0,0,1,0,0,0,1,0,0,0


In [29]:
# Split the merged frame into the label column and the feature matrix;
# identifiers and the raw text are not used as features.
target = df['target']
df_to_catboost = df.drop(['text', 'post_id', 'user_id', 'target'], axis=1)
# Only referenced by commented-out `cat_features=` arguments further down.
# NOTE(review): the names listed are the raw columns that earlier cells
# replaced with scaled or one-hot columns — confirm whether this list is still needed.
categorial_features = ['gender', 'age', 'country', 'city', 'exp_group', 'os', 'source', 'topic']
df_to_catboost

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,os_iOS,exp_group_1,exp_group_2,exp_group_3,exp_group_4,source_organic,cluster_1,cluster_2,cluster_3,cluster_4
0,-0.201362,-2.497871,-4.421739,3.475615,4.448384,2.778310,0.343653,1.853580,-0.801398,-0.898056,...,1,0,1,0,0,0,0,0,0,1
1,-0.073658,-2.427082,-2.868966,4.219771,1.794207,0.080879,0.841885,-1.586332,1.429029,-0.705965,...,1,0,1,0,0,0,0,0,0,1
2,-1.906735,-2.436222,-3.835560,-1.161802,-0.016847,3.784282,-0.093581,1.453832,-2.514608,0.404343,...,1,0,1,0,0,0,0,0,0,1
3,4.751424,6.113358,0.885324,-1.354131,1.480178,-0.811218,0.147476,-0.570175,-0.577149,-0.035983,...,1,0,1,0,0,0,0,0,0,1
4,-0.201362,-2.497871,-4.421739,3.475615,4.448384,2.778310,0.343653,1.853580,-0.801398,-0.898056,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459495,0.917119,2.050893,2.036139,-0.282366,1.406197,-2.279760,-3.106411,-2.154586,-1.740101,-2.174066,...,1,0,0,0,0,0,1,0,0,0
459496,-3.857872,0.185496,1.476427,0.483496,-0.701518,-1.114603,-0.429486,1.606428,-0.221046,-1.400103,...,0,0,1,0,0,0,1,0,0,0
459497,-6.120048,2.019082,-1.457876,3.215819,1.110198,0.841159,-1.421299,2.024819,-0.902010,2.056645,...,0,0,1,0,0,0,1,0,0,0
459498,-6.120048,2.019082,-1.457876,3.215819,1.110198,0.841159,-1.421299,2.024819,-0.902010,2.056645,...,0,0,1,0,0,0,1,0,0,0


from torch.utils.data import Dataset
# Build a feature frame with positional (0..n-1) row and column labels.
# NOTE(review): 'topic' is dropped here, but the one-hot cell above already
# removes it from post_text, so this drop would presumably raise KeyError.
# The dataset below is built from df_to_catboost, not from df_to_model.
df_to_model = df.drop(['post_id', 'user_id', 'topic', 'text'], axis=1)
target = df_to_model['target']
df_to_model = df_to_model.drop('target', axis=1) 
df_to_model.index = [i for i in range(df_to_model.shape[0])]
df_to_model.columns = [i for i in range(df_to_model.shape[1])]

class CustomDataset(Dataset):
    """Dataset that serves ``(features, label)`` pairs as float tensors.

    Args:
        df: frame of numeric features; its ``.values`` become the feature tensor.
        target: labels; flattened to one dimension before conversion.

    Both tensors are printed once at construction time.
    """

    def __init__(self, df, target):
        flat_labels = target.to_numpy().reshape(-1)
        self.labels = torch.tensor(flat_labels).to(torch.float32)
        print(self.labels)
        feature_matrix = df.values
        self.dataset = torch.tensor(feature_matrix).to(torch.float32)
        print(self.dataset)

    def __len__(self):
        # Number of rows in the feature tensor.
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        features = self.dataset[idx]
        label = self.labels[idx]
        return features, label

# Wrap the feature matrix and labels, then show the shape of one sample's
# features after adding a trailing dimension.
dataset = CustomDataset(df_to_catboost, target)
dataset[3649][0].unsqueeze(1).shape

In [30]:
import torch.nn as nn

class Net(nn.Module):
    """Fully connected binary classifier with batch-norm blocks and residual sums.

    Blocks 1-9 are ``Linear -> BatchNorm1d -> ReLU``; blocks 3, 5, 7 and 9 keep
    the width and their output is added to their input. Block 10 maps to one
    value per sample and squashes it with a sigmoid, so each row of the output
    depends only on that row's input (in eval mode) and lies in ``[0, 1]``.

    The previous ``Softmax(dim=0)`` head normalised the single output column
    across the batch, which made every prediction depend on the other samples
    in the batch. Submodule names and layouts are unchanged, so existing
    state dicts still load.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()

        # Defined but not applied in forward() (the calls there are commented out).
        self.dropout = nn.Sequential(
            nn.Dropout(p=0.2)
        )
        n = 32
        self.block1 = self._dense_block(input_size, n)
        self.block2 = self._dense_block(n, n * 2)
        self.block3 = self._dense_block(n * 2, n * 2)
        self.block4 = self._dense_block(n * 2, n * 4)
        self.block5 = self._dense_block(n * 4, n * 4)
        self.block6 = self._dense_block(n * 4, n * 8)
        self.block7 = self._dense_block(n * 8, n * 8)
        self.block8 = self._dense_block(n * 8, n * 16)
        self.block9 = self._dense_block(n * 16, n * 16)
        self.block10 = nn.Sequential(
            nn.Linear(n * 16, 1),
            # Per-sample probability; Softmax(dim=0) normalised over the batch.
            nn.Sigmoid()
        )

    @staticmethod
    def _dense_block(in_features, out_features):
        """Build one ``Linear -> BatchNorm1d -> ReLU`` block."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU()
        )

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor for a ``(batch, input_size)`` input."""
        output = self.block1(x)
        output = self.block2(output)
        output = self.block3(output) + output
        output = self.block4(output)
        output = self.block5(output) + output
        output = self.block6(output)
        output = self.block7(output) + output
        output = self.block8(output)
        output = self.block9(output) + output
        output = self.block10(output)

        return output

#!g1.1
from tqdm import tqdm


def train(model, train_loader) -> tuple:
    """Run one training epoch and return ``(mean_loss, accuracy)``.

    Uses the module-level ``device``, ``optimizer`` and ``loss_fn``.

    Args:
        model: network to train; called with float inputs.
        train_loader: iterable of ``(x, y)`` batches; ``y`` gets a trailing
            dimension added so it matches the model output.

    Returns:
        A tuple of the loss averaged over batches and the fraction of samples
        whose output, thresholded at 0.5, equals the label.
    """
    model.train()

    train_loss = 0
    total = 0
    correct = 0

    for x, y in tqdm(train_loader, desc='Train'):
        x, y = x.to(device), y.unsqueeze(1).to(device)

        optimizer.zero_grad()

        output = model(x.float())

        loss = loss_fn(output, y)

        train_loss += loss.item()

        loss.backward()
        # Apply the gradients; without this call the weights never change.
        optimizer.step()

        total += y.size(0)
        # Exact float equality between outputs and labels is almost never
        # true, so compare thresholded predictions instead.
        predicted = (output.detach() >= 0.5).to(y.dtype)
        correct += (predicted == y).sum().item()

    # Guard against an empty loader.
    train_loss /= max(len(train_loader), 1)
    accuracy = correct / total if total else 0.0

    return train_loss, accuracy

#!g1.1
@torch.inference_mode()
def evaluate(model, loader):
    """Evaluate ``model`` on ``loader`` and return ``(mean_loss, accuracy)``.

    Runs under ``torch.inference_mode`` with the model in eval mode and uses
    the module-level ``device`` and ``loss_fn``.

    Args:
        model: network to evaluate; called with float inputs.
        loader: iterable of ``(x, y)`` batches; ``y`` gets a trailing
            dimension added so it matches the model output.

    Returns:
        A tuple of the loss averaged over batches and the fraction of samples
        whose output, thresholded at 0.5, equals the label.
    """
    model.eval()

    total_loss = 0
    total = 0
    correct = 0

    for x, y in tqdm(loader, desc='Evaluation'):
        x, y = x.to(device), y.unsqueeze(1).to(device)

        output = model(x.float())
        loss = loss_fn(output, y)

        total_loss += loss.item()

        total += y.size(0)
        # Exact float equality between outputs and labels is almost never
        # true, so compare thresholded predictions instead.
        predicted = (output >= 0.5).to(y.dtype)
        correct += (predicted == y).sum().item()

    # Guard against an empty loader.
    total_loss /= max(len(loader), 1)
    accuracy = correct / total if total else 0.0

    return total_loss, accuracy

#!g1.1
from IPython.display import clear_output
import matplotlib.pyplot as plt

def plot_stats(train_loss, valid_loss, train_accuracy, valid_accuracy, title):
    """Plot the training and validation loss histories on one figure.

    Args:
        train_loss: sequence of training losses, one per epoch.
        valid_loss: sequence of validation losses, one per epoch.
        train_accuracy: accepted but not plotted.
        valid_accuracy: accepted but not plotted.
        title: prefix of the figure title (``title + ' loss'``).
    """
    plt.figure(figsize=(16, 8))
    plt.title(title + ' loss')

    curves = ((train_loss, 'Train loss'), (valid_loss, 'Valid loss'))
    for history, legend_label in curves:
        plt.plot(history, label=legend_label)

    plt.legend()
    plt.grid()
    plt.show()

def whole_train_valid_cycle(model, num_epochs, title, train_loader, valid_loader):
    """Train and validate ``model`` for ``num_epochs`` epochs, plotting as it goes.

    After every epoch the cell output is cleared and the loss curves are
    redrawn; the module-level ``scheduler`` is stepped once per epoch. The
    model's state dict is saved to ``net_<epoch>`` whenever the epoch number
    is a multiple of 25 (including epoch 0).

    Args:
        model: network passed to ``train`` and ``evaluate``.
        num_epochs: number of epochs to run.
        title: title prefix forwarded to ``plot_stats``.
        train_loader: loader passed to ``train``.
        valid_loader: loader passed to ``evaluate``.
    """
    history = {
        'train_loss': [],
        'valid_loss': [],
        'train_accuracy': [],
        'valid_accuracy': [],
    }

    for epoch in range(num_epochs):
        epoch_train_loss, epoch_train_accuracy = train(model, train_loader)
        epoch_valid_loss, epoch_valid_accuracy = evaluate(model, valid_loader)

        history['train_loss'].append(epoch_train_loss)
        history['valid_loss'].append(epoch_valid_loss)
        history['train_accuracy'].append(epoch_train_accuracy)
        history['valid_accuracy'].append(epoch_valid_accuracy)

        # Periodic checkpoint.
        if epoch % 25 == 0:
            torch.save(model.state_dict(), f'net_{epoch}')

        clear_output()
        plot_stats(
            history['train_loss'],
            history['valid_loss'],
            history['train_accuracy'],
            history['valid_accuracy'],
            title,
        )
        scheduler.step()

from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CyclicLR
from torch.optim.lr_scheduler import StepLR

# Train the neural network on the feature matrix.
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Random 50/50 split of the dataset into training and validation parts.
train_size = int(0.5 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True, num_workers=2, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=2048, shuffle=True, num_workers=2, pin_memory=True)


# Input width equals the number of feature columns.
net = Net(df_to_catboost.shape[1]).to(device)

# `optimizer`, `scheduler` and `loss_fn` are read as globals by train(),
# evaluate() and whole_train_valid_cycle().
optimizer = torch.optim.Adam(net.parameters(), lr=1e-1)

#scheduler = CyclicLR(optimizer, base_lr=0.01, max_lr=0.1, cycle_momentum=False)
# The scheduler is stepped once per epoch, so with step_size=50 the learning
# rate is not reduced during the 10-epoch run below.
scheduler = StepLR(optimizer, step_size=50, gamma=0.1)

#loss_fn = nn.L1Loss()
loss_fn = nn.MSELoss() 
#loss_fn = nn.CrossEntropyLoss(reduction='sum')

whole_train_valid_cycle(net, 10, 'First simple model', train_loader, valid_loader)

# Save the final weights.
torch.save(net.state_dict(), 'net01')

net

# NOTE(review): df_to_model is only defined in the df_to_model cell above,
# which may not have run — confirm this name exists at this point.
df_to_model.shape[1]

In [31]:
#df[~df['user_id'].isna()].dropna(axis=1)['user_id'].unique()

# Build one row per post for user 200: post features side by side with that
# user's features, with positional row and column labels.
# Stacking the two tables leaves user_id missing on post rows, which is how
# the post rows are picked out below.
test_df = pd.concat([post_text, user_data])
test_user = test_df[test_df['user_id'] == 200].dropna(axis=1)
test_post = test_df[test_df['user_id'].isna()].dropna(axis=1)
#pd.concat([test_post.drop(['text', 'topic', 'post_id'], axis=1), test_user.drop('user_id', axis=1)], axis=0)
# Repeat the user's row once per post so both frames have the same length.
test_user = test_user.loc[test_user.index.repeat(test_post.shape[0])]
test_post.index = test_user.index
# NOTE(review): 'topic' is dropped here, but the one-hot cell above already
# removes it from post_text — this drop would presumably raise KeyError.
test_res = pd.concat([test_post.drop(['text', 'topic', 'post_id'], axis=1), test_user.drop('user_id', axis=1)], axis=1)
test_res.index = [i for i in range(test_res.shape[0])]
test_res.columns = [i for i in range(test_res.shape[1])]
test_res
#test_post.drop(['text', 'topic', 'post_id'], axis=1).shape, test_user.drop('user_id', axis=1).shape

# Configured CatBoost classifier.
# NOTE(review): a later cell rebinds `catboost_model` to a default
# CatBoostClassifier() before fitting, so this configuration is not the one
# trained. early_stopping_rounds (15000) and metric_period (250) both exceed
# iterations (50) — confirm the intended values.
catboost_model = CatBoostClassifier(
                           eval_metric="AUC",
                           task_type="GPU",
                           learning_rate=1e-1,
                           iterations=50,
                           l2_leaf_reg=50,
                           random_seed=432013,
                           od_type="Iter",
                           depth=5,
                           early_stopping_rounds=15000,
                           border_count=64,
                            metric_period= 250,
                            #cat_features=categorial_features
                           #has_time= True 
                          )

In [32]:
from sklearn.model_selection import train_test_split


#X_train, X_test, y_train, y_test = train_test_split(df_to_catboost, target, train_size=0.99999
                                                #    ,random_state=42
                                              #     ) 

#catboost_model = CatBoostClassifier( eval_metric="AUC", task_type="GPU", 
 #                                   learning_rate=6e-1, iterations=2000, l2_leaf_reg=50, 
  #                                  random_seed=432013, od_type="Iter", depth=5, 
   #                                 early_stopping_rounds=15000, border_count=64, 
    #                                metric_period= 250, #cat_features=categorial_features #has_time= True 
     #                              )

# Fit a CatBoost classifier with default parameters on the full feature
# matrix; no rows are held out for validation here (the train_test_split call
# above is commented out).
catboost_model = CatBoostClassifier()
catboost_model.fit(df_to_catboost, target)

Learning rate set to 0.141166
0:	learn: 0.6872912	total: 329ms	remaining: 5m 28s
1:	learn: 0.6832127	total: 603ms	remaining: 5m
2:	learn: 0.6799314	total: 875ms	remaining: 4m 50s
3:	learn: 0.6774606	total: 1.14s	remaining: 4m 43s
4:	learn: 0.6755291	total: 1.38s	remaining: 4m 35s
5:	learn: 0.6741242	total: 1.62s	remaining: 4m 28s
6:	learn: 0.6731242	total: 1.84s	remaining: 4m 21s
7:	learn: 0.6722479	total: 2.16s	remaining: 4m 28s
8:	learn: 0.6715149	total: 2.44s	remaining: 4m 28s
9:	learn: 0.6709331	total: 2.66s	remaining: 4m 23s
10:	learn: 0.6705141	total: 2.92s	remaining: 4m 22s
11:	learn: 0.6699333	total: 3.21s	remaining: 4m 24s
12:	learn: 0.6692499	total: 3.51s	remaining: 4m 26s
13:	learn: 0.6687647	total: 3.74s	remaining: 4m 23s
14:	learn: 0.6682445	total: 4.09s	remaining: 4m 28s
15:	learn: 0.6679012	total: 4.52s	remaining: 4m 37s
16:	learn: 0.6675896	total: 4.81s	remaining: 4m 38s
17:	learn: 0.6673030	total: 5.05s	remaining: 4m 35s
18:	learn: 0.6668063	total: 5.35s	remaining: 4m 

<catboost.core.CatBoostClassifier at 0x7fe7cf11e050>

In [33]:
# Save the fitted model in CatBoost's binary "cbm" format.
# NOTE(review): load_models() reads "catboost_model", while this writes
# 'catboost_model_32' — confirm which file name is intended.
catboost_model.save_model('catboost_model_32', format="cbm")

In [34]:
#catboost_model.shrink(428)
#catboost_model

In [35]:
catboost_model.predict_proba(df_to_catboost)[0]

array([0.03531905, 0.96468095])

In [36]:
#all_data = batch_load_sql_all_feed_data(f"SELECT * FROM public.feed_data LIMIT {feed_data_limit}", post_text, user_data)

In [37]:
torch.tensor(target.to_numpy().reshape(-1)).float()

tensor([1., 1., 0.,  ..., 1., 0., 1.])

# Read the stored table back and drop every column that contains a missing value.
# NOTE(review): credentials are hard-coded in this URL. The query filters on
# post_id, but the frame written to this table by the SAVE_DF_TO_SQL cell has
# only the PCA component columns and emb_id — confirm the table's actual schema.
engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )
pd.read_sql('SELECT * FROM "denis21.97@mail.ru_lesson_22" WHERE post_id IS NOT NULL', con=engine).dropna(axis=1)