In [1]:
import pandas as pd
import os.path
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
import plotly.express as px
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, median_absolute_error
import pickle
import numpy as np
from tqdm import tqdm

# Data preprocessing

In [2]:
# Connection string for the PostgreSQL database this notebook reads from.
# It is taken from the DATABASE_URL environment variable so that real
# credentials do not have to be committed with the notebook; the fallback is
# the local development database the notebook was originally written against.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:admin@localhost:5433/postgres",
)
engine = create_engine(DATABASE_URL)

In [69]:
# Pull every row of the `ads` table whose created_at is not later than the
# cut-off literal used for this experiment.
ads_query = """SELECT * FROM ads WHERE created_at <= '2024-05-05'"""
df = pd.read_sql(ads_query, con=engine)

In [70]:
# Normalize the free-text description: drop periods and commas, turn line
# breaks into spaces, and replace missing descriptions with the string 'None'.
# regex=False is spelled out because '.' must be matched literally; relying on
# the implicit default triggers a FutureWarning and differs across pandas
# versions.
df['description'] = (
    df['description']
    .str.replace('.', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('\n', ' ', regex=False)
    .fillna('None')
)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [71]:
# Express every price in hryvnia. Rows whose currency is not 'грн' are
# multiplied by a fixed rate (presumably UAH per USD — TODO confirm which
# currencies actually occur in the data).
UAH_PER_FOREIGN_UNIT = 40
not_in_uah = df['currency'] != 'грн'
df.loc[not_in_uah, 'price'] = df.loc[not_in_uah, 'price'] * UAH_PER_FOREIGN_UNIT
# Relabel the converted rows so that re-running this cell cannot multiply the
# same prices by the rate a second time.
df.loc[not_in_uah, 'currency'] = 'грн'

## Train test split

In [72]:
# Hold out 33% of the ads for evaluation. The split is stratified on `source`
# so both parts keep the same mix of sources, and seeded for repeatability.
features = df.drop(columns=['price'])
target = df['price']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
    stratify=df['source'],
)

In [73]:
# Download a Ukrainian stop-word list and save it as a file named 'ukrainian'
# inside NLTK's stopwords corpus directory, which is where
# stopwords.words('ukrainian') looks for it.
url = 'https://raw.githubusercontent.com/olegdubetcky/Ukrainian-Stopwords/main/ukrainian'
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the word list.
r.raise_for_status()
# Ask NLTK where the stopwords corpus lives rather than hardcoding one user's
# absolute path. This requires the corpus to be installed as an unpacked
# directory (the result of nltk.download('stopwords')); otherwise it raises.
stopwords_dir = nltk.data.find('corpora/stopwords').path
with open(os.path.join(stopwords_dir, 'ukrainian'), 'wb') as f:
    f.write(r.content)

In [74]:
def preprocess_data(doc_set):
    r"""Tokenize each document and strip stop words.

    Every document is lower-cased and split into tokens matching ``\w+``.
    Tokens that appear in NLTK's 'ukrainian' or 'russian' stop-word lists, or
    that equal 'та', are discarded.

    Args:
        doc_set: iterable of strings, one per document.

    Returns:
        A list with one list of remaining tokens per document, in input order.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    ignored_words = (
        set(stopwords.words('ukrainian'))
        | set(stopwords.words('russian'))
        | {'та'}
    )
    return [
        [token for token in word_tokenizer.tokenize(document.lower()) if token not in ignored_words]
        for document in doc_set
    ]


In [84]:
# Sweep the Word2Vec embedding size (5, 10, ..., 95) with a fixed window of 40.
# For every size: train embeddings on the training descriptions, represent each
# ad as the mean of its word vectors, fit a LightGBM regressor on those means
# and record the R^2 it reaches on the test split.
# NOTE(review): the scores come from the test split, so choosing a size from
# them tunes a hyper-parameter on test data — consider a validation split.

# Tokenization does not depend on the embedding size, so it runs once here
# instead of once per iteration.
clean_text = preprocess_data(X_train['description'])
X_train['clened_description'] = clean_text
X_test['clened_description'] = preprocess_data(X_test['description'])

res = []
for num in tqdm(range(5, 100, 5)):
    number_of_topics = num
    model = Word2Vec(clean_text, min_count=1, vector_size=number_of_topics, window=40, workers=8, epochs=50)

    def mean_word_vector(text):
        # Average the vectors of the tokens the model knows. For a document
        # with no known token the list is empty and np.mean yields NaN.
        return np.mean([model.wv[word] for word in text if word in model.wv], axis=0)

    X_train_vector = pd.DataFrame(X_train['clened_description'].apply(mean_word_vector).to_list())

    X_test_vector = X_test['clened_description'].apply(mean_word_vector)
    # Test ads made only of words unseen in training have no mean vector; give
    # them the vector of 'квартира', the same fallback the window sweep and the
    # final model cells use, so every row has a full-length feature vector.
    no_known_tokens = X_test_vector.isna()
    X_test_vector.loc[no_known_tokens] = X_test_vector.loc[no_known_tokens].map(lambda _: model.wv['квартира'])
    X_test_vector = pd.DataFrame(X_test_vector.to_list())

    lightgbm_model = LGBMRegressor().fit(X_train_vector, y_train)
    res.append(r2_score(y_test, lightgbm_model.predict(X_test_vector)))

  5%|▌         | 1/19 [00:07<02:20,  7.82s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 5
[LightGBM] [Info] Start training from score 41895.887600


 11%|█         | 2/19 [00:15<02:11,  7.76s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 10
[LightGBM] [Info] Start training from score 41895.887600


 16%|█▌        | 3/19 [00:23<02:07,  7.99s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 15
[LightGBM] [Info] Start training from score 41895.887600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 20
[LightGBM] [Info] Start training from score 41895.887600


 21%|██        | 4/19 [00:33<02:08,  8.55s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6375
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 25
[LightGBM] [Info] Start training from score 41895.887600


 32%|███▏      | 6/19 [00:53<02:01,  9.34s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 30
[LightGBM] [Info] Start training from score 41895.887600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8925
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 35
[LightGBM] [Info] Start training from score 41895.887600


 42%|████▏     | 8/19 [01:07<01:31,  8.27s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10200
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 40
[LightGBM] [Info] Start training from score 41895.887600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11475
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 45
[LightGBM] [Info] Start training from score 41895.887600


 47%|████▋     | 9/19 [01:16<01:22,  8.27s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 50
[LightGBM] [Info] Start training from score 41895.887600


 53%|█████▎    | 10/19 [01:25<01:16,  8.46s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14025
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 55
[LightGBM] [Info] Start training from score 41895.887600


 58%|█████▊    | 11/19 [01:35<01:11,  8.92s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15300
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 60
[LightGBM] [Info] Start training from score 41895.887600


 63%|██████▎   | 12/19 [01:45<01:05,  9.40s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16575
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 65
[LightGBM] [Info] Start training from score 41895.887600


 68%|██████▊   | 13/19 [01:53<00:53,  8.85s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17850
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 70
[LightGBM] [Info] Start training from score 41895.887600


 74%|███████▎  | 14/19 [02:01<00:44,  8.86s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19125
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 75
[LightGBM] [Info] Start training from score 41895.887600


 79%|███████▉  | 15/19 [02:13<00:38,  9.55s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20400
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 80
[LightGBM] [Info] Start training from score 41895.887600


 84%|████████▍ | 16/19 [02:23<00:29,  9.81s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 85
[LightGBM] [Info] Start training from score 41895.887600


 89%|████████▉ | 17/19 [02:35<00:20, 10.32s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22950
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 90
[LightGBM] [Info] Start training from score 41895.887600


 95%|█████████▍| 18/19 [02:49<00:11, 11.43s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24225
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 95
[LightGBM] [Info] Start training from score 41895.887600


100%|██████████| 19/19 [03:02<00:00,  9.63s/it]


In [85]:
# R^2 on the test split for each embedding size tried in the sweep; `res` must
# hold the scores of the vector_size sweep (19 values) when this cell runs.
px.line(
    x=range(5, 100, 5),
    y=res,
    labels={'x': 'Word2Vec vector_size', 'y': 'R^2 on test split'},
    title='Test R^2 vs. embedding size (window=40)',
    template='plotly',
)

In [80]:
# Sweep the Word2Vec context window (5, 15, ..., 95) with the embedding size
# fixed at 13. For every window: train embeddings on the training descriptions,
# represent each ad as the mean of its word vectors, fit a LightGBM regressor
# and record the R^2 it reaches on the test split.
# NOTE(review): the scores come from the test split, so choosing a window from
# them tunes a hyper-parameter on test data — consider a validation split.

# Neither the embedding size nor the tokenization depends on the window, so
# both are set up once here instead of once per iteration.
number_of_topics = 13
clean_text = preprocess_data(X_train['description'])
X_train['clened_description'] = clean_text
X_test['clened_description'] = preprocess_data(X_test['description'])

res = []
for num in tqdm(range(5, 100, 10)):
    model = Word2Vec(clean_text, min_count=1, vector_size=number_of_topics, window=num, workers=8, epochs=50)

    def mean_word_vector(text):
        # Average the vectors of the tokens the model knows. For a document
        # with no known token the list is empty and np.mean yields NaN.
        return np.mean([model.wv[word] for word in text if word in model.wv], axis=0)

    X_train_vector = pd.DataFrame(X_train['clened_description'].apply(mean_word_vector).to_list())

    X_test_vector = X_test['clened_description'].apply(mean_word_vector)
    # Test ads without any known token get the vector of 'квартира' so that
    # every row has a full-length feature vector.
    no_known_tokens = X_test_vector.isna()
    X_test_vector.loc[no_known_tokens] = X_test_vector.loc[no_known_tokens].map(lambda _: model.wv['квартира'])
    X_test_vector = pd.DataFrame(X_test_vector.to_list())

    lightgbm_model = LGBMRegressor().fit(X_train_vector, y_train)
    res.append(r2_score(y_test, lightgbm_model.predict(X_test_vector)))

 10%|█         | 1/10 [00:05<00:45,  5.00s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 20%|██        | 2/10 [00:12<00:50,  6.37s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 30%|███       | 3/10 [00:18<00:44,  6.32s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 40%|████      | 4/10 [00:26<00:42,  7.13s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 50%|█████     | 5/10 [00:35<00:38,  7.72s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 60%|██████    | 6/10 [00:45<00:33,  8.33s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 70%|███████   | 7/10 [00:54<00:25,  8.55s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 80%|████████  | 8/10 [01:04<00:18,  9.17s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


 90%|█████████ | 9/10 [01:15<00:09,  9.62s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 13
[LightGBM] [Info] Start training from score 41895.887600


100%|██████████| 10/10 [01:29<00:00,  8.95s/it]


In [83]:
# R^2 on the test split for each context window tried in the sweep; `res` must
# hold the scores of the window sweep (10 values) when this cell runs.
px.line(
    x=range(5, 100, 10),
    y=res,
    labels={'x': 'Word2Vec window', 'y': 'R^2 on test split'},
    title='Test R^2 vs. context window (vector_size=13)',
    template='plotly',
)

In [86]:

# Train the embedding model with the chosen settings (30 dimensions, window 55)
# and turn every training ad into the mean of its word vectors.
clean_text = preprocess_data(X_train['description'])
model = Word2Vec(
    clean_text,
    min_count=1,
    vector_size=30,
    window=55,
    workers=8,
    epochs=50,
)
X_train['clened_description'] = clean_text


def _mean_train_vector(text):
    # Mean of the vectors of the tokens present in the model's vocabulary.
    known_vectors = [model.wv[word] for word in text if word in model.wv]
    return np.mean(known_vectors, axis=0)


train_doc_vectors = X_train['clened_description'].apply(_mean_train_vector)
X_train_vector = pd.DataFrame(train_doc_vectors.to_list())
# Tokenize the held-out descriptions; they are vectorized in the next cell.
X_test['clened_description'] = preprocess_data(X_test['description'])


In [87]:
# Represent each test ad as the mean of its known word vectors.
X_test_vector = X_test['clened_description'].apply(
    lambda text: np.mean([model.wv[word] for word in text if word in model.wv], axis=0)
)
# Ads with no token in the vocabulary come out as NaN; substitute the vector
# of 'квартира' for those rows.
without_vector = X_test_vector.isna()
X_test_vector.loc[without_vector] = X_test_vector.loc[without_vector].map(
    lambda _: model.wv['квартира']
)
# Expand the per-ad vectors into one numeric column per embedding dimension.
X_test_vector = pd.DataFrame(X_test_vector.to_list())

In [88]:

# Fit a default LightGBM regressor on the mean-embedding features and show its
# R^2 on the held-out ads as the cell output.
lightgbm_model = LGBMRegressor()
lightgbm_model.fit(X_train_vector, y_train)
test_predictions = lightgbm_model.predict(X_test_vector)
r2_score(y_test, test_predictions)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 2758, number of used features: 30
[LightGBM] [Info] Start training from score 41895.887600


0.4665199515937605

In [89]:
# Persist the trained Word2Vec model next to the notebook for later reuse.
# NOTE(review): a pickle must only ever be loaded from a trusted source.
with open('word2vec_model.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file)
