In [None]:
# Work from the project folder so the relative table_path used later resolves.
# NOTE(review): presumably requires Google Drive to be mounted beforehand — confirm.
%cd drive/MyDrive/ml\ cc

/content/drive/.shortcut-targets-by-id/11EyRLAGq5qzkKriu7BOK1pyX5mUtKXsn/ml cc


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import transformers
from tqdm import notebook
from tqdm.notebook import tqdm
# Registers `progress_apply` on pandas objects (used by text2vec).
tqdm.pandas()

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

from tabulate import tabulate
from termcolor import colored

import itertools
# Zero-based counter: every call to inc() returns the next integer.
inc = itertools.count().__next__

# NLTK resources needed by the lemmatizer, the tokenizer and the stop-word
# list. 'punkt_tab' is what nltk.word_tokenize loads in recent NLTK releases
# (it superseded the pickled 'punkt' models); 'punkt' is kept for older ones.
for _nltk_resource in ('omw-1.4', 'wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
stopwords = set(nltk_stopwords.words('english'))

import warnings
# NOTE(review): this hides every warning, including deprecation and
# convergence warnings from the models below — consider narrowing the filter.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Excel workbook with the funding rounds, relative to the working directory.
table_path = './AI_funding_rounds — копия.xlsx'

# Column the regressors are trained to predict.
goal_column = 'Total Funding Amount Currency (in USD)'

# Columns removed from the raw table before feature extraction.
list_col_for_drop = [
    'Transaction Name',
    'Transaction Name URL',
    'Organization Name',
    'Organization Name URL',
    'Funding Type',
    'Organization Website',
    'Pre-Money Valuation',
    'Pre-Money Valuation Currency',
    'Pre-Money Valuation Currency (in USD)',
    'Number of Partner Investors',
    'Lead Investors',
    'Investor Names',
    'Province',
    'City',
    'Total Funding Amount Currency',
    'Money Raised Currency',
    'Country',
]

In [None]:
def text2vec(df):
    """Turn free-text columns into dense TF-IDF features.

    Every column of ``df`` is handled independently: missing values become
    the literal string 'unknown', each character that is not an ASCII letter
    or an apostrophe is replaced by a space, the tokens are lemmatized with
    WordNet, and the cleaned text is vectorized by its own ``TfidfVectorizer``
    (NLTK English stop words removed, at most 128 terms).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold text.

    Returns
    -------
    pandas.DataFrame
        The per-column TF-IDF matrices concatenated side by side. Column
        labels are the integer term positions produced for each source
        column, so the same label repeats once per source column.
    """
    df = df.fillna('unknown')

    lemmatizer = WordNetLemmatizer()

    def lemmatize(text):
        word_list = nltk.word_tokenize(text)
        return ' '.join(lemmatizer.lemmatize(w) for w in word_list)

    def clear_text(text):
        # str() guards against non-string cells that survived fillna.
        text = re.sub(r"[^a-zA-Z']", ' ', str(text))
        return ' '.join(text.split())

    # scikit-learn validates `stop_words` as str, list or None, so the
    # module-level set is converted (sorted only to make the order stable).
    stop_word_list = sorted(stopwords)

    tf_idf_frames = []
    for key in df.columns:
        lemmatised = df[key].progress_apply(lambda x: lemmatize(clear_text(x)))
        # A fresh vectorizer per column: each column gets its own vocabulary.
        vectorizer = TfidfVectorizer(stop_words=stop_word_list, max_features=128)
        # .toarray() replaces the `.A` shortcut, which recent SciPy versions
        # deprecate/remove on sparse matrices.
        tf_idf_frames.append(pd.DataFrame(vectorizer.fit_transform(lemmatised).toarray()))

    if not tf_idf_frames:
        # No text columns: return a column-less frame with one row per input row
        # instead of letting pd.concat raise on an empty list.
        return pd.DataFrame(index=pd.RangeIndex(len(df)))
    return pd.concat(tf_idf_frames, axis=1)


def numeric2vec(df):
    """Fill missing values of every column with that column's mode.

    The input frame is left untouched; a new frame is returned. The previous
    in-place fill modified the caller's object and triggered pandas'
    SettingWithCopyWarning when that object was a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with (numeric) columns that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with NaNs replaced by the first mode of each column.
        Columns that have no mode (e.g. all-NaN) keep their NaNs.
    """
    modes = df.mode()
    if modes.empty:
        # Empty frame or no column has any non-missing value: nothing to fill with.
        return df.copy()
    # Row 0 of DataFrame.mode() holds the first mode of each column.
    return df.fillna(modes.iloc[0])


def categorical2vec(df):
    """One-hot encode a frame of categorical columns.

    Missing values are first replaced by the literal category 'unknown';
    the filled frame is then passed to ``pd.get_dummies``.
    """
    filled = df.fillna('unknown')
    return pd.get_dummies(filled)


def date2vec(df):
    """Split date values into day, year and month columns.

    ``df`` is converted with ``pd.DatetimeIndex``; the result is a new frame
    with the columns 'Announced Date: day', 'Announced Date: year' and
    'Announced Date: month' (in that order) and a fresh default index.
    """
    stamps = pd.DatetimeIndex(df)

    date_parts = {}
    for part in ('day', 'year', 'month'):
        date_parts[f'Announced Date: {part}'] = list(getattr(stamps, part))

    return pd.DataFrame(data=date_parts)

In [None]:
def split_types_and_vectorize(df):
    """Split ``df`` by column type, vectorize each part and join the results.

    The frame is divided into four parts, each handled by its own helper:

    * numeric columns                              -> ``numeric2vec``
    * 'Organization Industries' / 'Organization Description' -> ``text2vec``
    * 'Announced Date'                             -> ``date2vec``
    * every remaining non-numeric column           -> ``categorical2vec``

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the three named columns above plus any number of
        numeric and categorical columns.

    Returns
    -------
    pandas.DataFrame
        Numeric, one-hot, TF-IDF and date features concatenated column-wise
        (in that order) on a fresh 0..n-1 index.

    Notes
    -----
    Fixes relative to the earlier version: the function now works on its
    ``df`` argument instead of the global ``init_table``; column order is
    deterministic (no ``set`` arithmetic); and the original row index is
    dropped rather than being inserted as extra 'index' / 'level_0' feature
    columns.
    """
    text_columns_for_vectorising = ['Organization Industries',
                                    'Organization Description']
    date_column = 'Announced Date'

    numeric_data_cols = list(df._get_numeric_data().columns)
    numeric_lookup = set(numeric_data_cols)
    handled_separately = set(text_columns_for_vectorising) | {date_column}
    # Preserve the frame's own column order so the one-hot layout is reproducible.
    categorical_cols = [c for c in df.columns
                        if c not in numeric_lookup and c not in handled_separately]

    # drop=True gives every part the same 0..n-1 index for the final concat
    # without turning the old row labels into features. reset_index also
    # returns new objects, so the helpers never operate on slices of `df`.
    df_numeric = df[numeric_data_cols].reset_index(drop=True)
    df_categorical = df[categorical_cols].reset_index(drop=True)
    df_text = df[text_columns_for_vectorising].reset_index(drop=True)
    df_date = df[date_column].reset_index(drop=True)

    print("Prepare numeric data ...")
    df_numeric = numeric2vec(df_numeric)

    print("Prepare categorical data ...")
    df_categorical = categorical2vec(df_categorical)

    print("Prepare text data ...")
    df_text = text2vec(df_text)

    print("Prepare date data ...")
    df_date = date2vec(df_date)

    return pd.concat([df_numeric, df_categorical, df_text, df_date], axis=1)

## Data preparation

In [None]:
# Read the workbook, keep only rows whose target value is present and
# remove the columns listed in list_col_for_drop.
init_table = (
    pd.read_excel(table_path)
    .dropna(subset=[goal_column])
    .drop(list_col_for_drop, axis=1)
)

# Vectorize the cleaned table (numeric, categorical, text and date parts).
prepared_table = split_types_and_vectorize(init_table)

Prepare numeric data ...
Prepare categorical data ...
Prepare text data ...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


  0%|          | 0/30644 [00:00<?, ?it/s]

  0%|          | 0/30644 [00:00<?, ?it/s]

Prepare date data ...


In [None]:
# Labels that occur exactly once in prepared_table; ren() leaves these as they are.
_repeated_label_mask = prepared_table.columns.duplicated(keep=False)
excluded = prepared_table.columns[~_repeated_label_mask]

# Restart the suffix counter at zero for the renaming below.
import itertools
inc = itertools.count().__next__

# A renamer
def ren(name):
    """Return ``name`` unchanged if it is in ``excluded``; otherwise append the next ``inc()`` value.

    Relies on the module-level ``excluded`` labels and ``inc`` counter.
    """
    if name in excluded:
        return name
    return f"{name}{inc()}"

In [None]:
# DataFrame.rename returns a new frame, so the result has to be assigned back;
# otherwise prepared_table keeps its duplicated column labels for every later cell.
# The guard keeps the cell idempotent: once the labels are unique, a re-run
# does not append further suffixes.
if prepared_table.columns.has_duplicates:
    prepared_table = prepared_table.rename(columns=ren)
prepared_table

Unnamed: 0,index0,Money Raised,Money Raised Currency (in USD),Total Funding Amount,Total Funding Amount Currency (in USD),Number of Funding Rounds,Number of Investors,level_0,index1,Equity Only Funding_No,...,122253,123254,124255,125256,126257,127258,index259,Announced Date: day,Announced Date: year,Announced Date: month
0,0,990000000.0,13355547.0,990000000.0,13355548.0,1,1.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,6,2021,1
1,1,13000000.0,13000000.0,13000000.0,13000000.0,1,1.0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,11,2019,1
2,3,2499979.0,2499979.0,8450600.0,8450600.0,3,1.0,2,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,15,2016,6
3,4,1525999.0,1525999.0,8450600.0,8450600.0,3,1.0,3,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,14,2015,8
4,5,1000000.0,1000000.0,10500000.0,10500000.0,4,2.0,4,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,12,2020,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30639,35111,50000.0,50000.0,65512.0,65512.0,2,1.0,30639,35111,0,...,0.0,0.0,0.0,0.0,0.0,0.0,30639,22,2017,2
30640,35112,3000000.0,440561.0,23000000.0,3677984.0,3,1.0,30640,35112,0,...,0.0,0.0,0.0,0.0,0.0,0.0,30640,1,2017,6
30641,35113,500000.0,500000.0,2400000.0,2400000.0,3,1.0,30641,35113,0,...,0.0,0.0,0.0,0.0,0.0,0.0,30641,30,2017,12
30642,35114,3000000.0,440561.0,13000000.0,2024021.0,2,1.0,30642,35114,0,...,0.0,0.0,0.0,0.0,0.0,0.0,30642,1,2017,6


In [None]:
# Separate the regression target from the features. DataFrame.pop removes the
# column from prepared_table in place, so re-running this cell without
# rebuilding prepared_table raises KeyError.
# NOTE(review): 'Total Funding Amount' remains among the features and equals the
# target for several rows in the table preview — confirm this is not target leakage.
y = prepared_table.pop(goal_column)
y.shape, prepared_table.shape

((30644,), (30644, 300))

In [None]:
# Total count of missing values remaining across all feature columns.
prepared_table.isna().sum().sum()

0

In [None]:
# Hold out 20% of the rows as the test set ...
X_init_core, X_init_test, y_core, y_test = train_test_split(prepared_table, y, test_size=0.2, random_state=42)
# ... then take 25% of the remaining 80% for validation, i.e. roughly a
# 60/20/20 train/validation/test split. Both splits use a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_init_core, y_core, test_size=0.25, random_state=42)

In [None]:
# Preview the training targets.
y_train


1250     10000000.0
16028     2500000.0
5278     66750000.0
15494     2600976.0
23948     7046920.0
            ...    
26116     1000000.0
15645     7500000.0
15073      500000.0
23305    35000000.0
30016    16050000.0
Name: Total Funding Amount Currency (in USD), Length: 18386, dtype: float64

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [None]:
# Candidate regressors. The estimators that accept a seed get a fixed
# random_state (same seed as the data splits) so that repeated runs produce
# the same fitted models and scores.
# The order matters: models[0] is the model evaluated on the test set later.
# NOTE(review): SGDRegressor and SVR are fitted on unscaled features; the
# SGDRegressor row of the validation table shows astronomically large errors —
# consider standardizing the inputs for these two.
models = [GradientBoostingRegressor(n_estimators=200, random_state=42),
          ElasticNet(),
          SGDRegressor(random_state=42),
          SVR(),
          BayesianRidge(),
        #   CatBoostRegressor(),  # currently disabled
          LinearRegression(),
          LGBMRegressor(random_state=42)]

In [None]:
# Metric name -> metric function; each is called as fn(y_true=..., y_pred=...).
scores = dict(
    explained_variance_score=explained_variance_score,
    mean_absolute_error=mean_absolute_error,
    mean_squared_error=mean_squared_error,
    mean_absolute_percentage_error=mean_absolute_percentage_error,
)

# Validation results, filled in as {model class name: {metric name: value}}.
scores_res = dict()

In [None]:
# Fit every candidate on the training split, announcing each one first.
for model in models:
    model_name = model.__class__.__name__
    print(f'model {model_name} is fitting')
    model.fit(X_train, y_train)


In [None]:
# Score every fitted model on the validation split.
for model in models:
    # Predict once per model; the predictions do not depend on the metric,
    # so recomputing them inside the metric loop was wasted work.
    y_pred = model.predict(X_valid)
    scores_res[model.__class__.__name__] = {
        sc: score_fn(y_true=y_valid, y_pred=y_pred)
        for sc, score_fn in scores.items()
    }


In [None]:
# One row per model: coloured model name followed by its metric values.
validation_rows = [
    [colored(model_name, 'blue'), *metric_values.values()]
    for model_name, metric_values in scores_res.items()
]

# Header row: bold 'model' label followed by the metric names.
validation_headers = [colored('model', 'blue', attrs=['bold'])]
validation_headers.extend(colored(metric_name, 'grey') for metric_name in scores)

print(tabulate(validation_rows, headers=validation_headers))

[1m[34mmodel[0m                        [30mexplained_variance_score[0m    [30mmean_absolute_error[0m    [30mmean_squared_error[0m    [30mmean_absolute_percentage_error[0m
-------------------------  --------------------------  ---------------------  --------------------  --------------------------------
[34mGradientBoostingRegressor[0m                 0.976831               6.64074e+06           2.33281e+15                       3.02201
[34mElasticNet[0m                                0.0698223              5.2742e+07            9.36186e+16                      52.333
[34mSGDRegressor[0m                             -6.90032e+35            2.23836e+25           6.99472e+52                       1.81266e+18
[34mSVR[0m                                       3.10778e-07            5.03124e+07           1.02784e+17                      15.465
[34mBayesianRidge[0m                             0.0338927              5.04217e+07           9.72332e+16                      46.2

In [None]:
# Test-set results, stored as {model class name: {metric name: value}}.
scores_res_test = {}

# Only the first entry of `models` is evaluated on the held-out test split.
for model in [models[0]]:
    # Predict once and reuse the predictions for every metric instead of
    # calling predict again for each one.
    y_pred = model.predict(X_init_test)
    scores_res_test[model.__class__.__name__] = {
        sc: score_fn(y_true=y_test, y_pred=y_pred)
        for sc, score_fn in scores.items()
    }

In [None]:
# One row per evaluated model: coloured model name followed by its test metrics.
test_rows = [
    [colored(model_name, 'blue'), *metric_values.values()]
    for model_name, metric_values in scores_res_test.items()
]

# Header row: bold 'model' label followed by the metric names.
test_headers = [colored('model', 'blue', attrs=['bold'])]
test_headers.extend(colored(metric_name, 'grey') for metric_name in scores)

print(tabulate(test_rows, headers=test_headers))

[1m[34mmodel[0m                        [30mexplained_variance_score[0m    [30mmean_absolute_error[0m    [30mmean_squared_error[0m    [30mmean_absolute_percentage_error[0m
-------------------------  --------------------------  ---------------------  --------------------  --------------------------------
[34mGradientBoostingRegressor[0m                    0.992399             6.2788e+06           8.69887e+14                            3.4728
