# Пример ноутбука для соревнования

### Импортируем необходимые библиотеки

In [None]:
import pandas as pd
import numpy as np
import sklearn

import matplotlib.pyplot as plt
%matplotlib inline

### Подгружаем и разархивируем данные для соревнования

In [None]:
%%capture
!wget https://www.dropbox.com/s/mn9geg9ac3n0i3o/gender_test_kaggle_sample_submission.csv
!wget https://www.dropbox.com/s/o7oe7yzcvvva9az/gender_train.csv
!wget https://www.dropbox.com/s/phbo4sgjsiqco8z/transactions.csv.zip

In [None]:
!unzip transactions.csv.zip

Archive:  transactions.csv.zip
  inflating: transactions.csv        


##### Проверяем, что скачали

In [None]:
!ls

gender_test_kaggle_sample_submission.csv  sample_data	    transactions.csv.zip
gender_train.csv			  transactions.csv


### Подготовка данных по транзакциям

#### Считываем файл

In [None]:
transactions_data = pd.read_csv('transactions.csv')

In [None]:
transactions_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3751083 entries, 0 to 3751082
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   Unnamed: 0   int64  
 1   customer_id  int64  
 2   tr_datetime  object 
 3   mcc_code     int64  
 4   tr_type      int64  
 5   amount       float64
 6   term_id      object 
dtypes: float64(1), int64(4), object(2)
memory usage: 200.3+ MB


In [None]:
# Discard the unnamed first CSV column (it looks like a saved row index).
transactions_data.drop(columns='Unnamed: 0', inplace=True)

In [None]:
transactions_data.shape

(3751083, 6)

#### Информация по каждому столбцу

* **customer_id** - идентификатор клиента
* **tr_datetime** - день и время совершения транзакции (дни нумеруются с начала данных)
* **mcc_code** - mcc-код транзакции
* **tr_type** - тип транзакции
* **amount** - сумма транзакции в условных единицах; со знаком "+" — начисление средств клиенту (приходная транзакция), "-" — списание средств (расходная транзакция)
* **term_id**  - идентификатор терминала


In [None]:
transactions_data.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-2245.92,
1,39026145,1 10:19:29,6011,7010,56147.89,
2,39026145,1 10:20:56,4829,2330,-56147.89,
3,39026145,1 10:39:54,5499,1010,-1392.47,
4,39026145,2 15:33:42,5499,1010,-920.83,


#### Уникальные типы транзакций

In [None]:
# Every distinct transaction type, in order of first appearance; this order
# becomes the index order of the vectors built by get_codes_vector below.
unique_codes = transactions_data['tr_type'].unique()

In [None]:
unique_codes

array([1030, 7010, 2330, 1010, 2010, 7020, 1110, 6110, 7014, 1100, 2020,
       2370, 7070, 7030, 7071, 2331, 4051, 6010, 2011, 4011, 2371, 4071,
       4110, 2110, 7031, 7074, 7011, 1200, 2320, 7021, 1210, 4010, 4041,
       6200, 6100, 2021, 2456, 4097, 2210, 4210, 2440, 7034, 4090, 4500,
       4031, 7024, 8145, 4100, 1410, 1510, 7035, 7075, 1310, 7041, 2460,
       2340, 6000, 4020, 1000, 7015, 6210, 4021, 2446, 4096, 2100, 7040,
       8100, 4061, 2341, 8146, 2200, 4200, 4035, 7025, 4045, 2000])

In [None]:
def get_codes_vector(data, unique_codes, code_column_name='tr_type'):
    """Count how often each code from ``unique_codes`` occurs in ``data``.

    Args:
        data: DataFrame holding the column named by ``code_column_name``.
        unique_codes: Codes to report, in the order the result should use.
        code_column_name: Column of ``data`` whose values are counted.

    Returns:
        A Series indexed by ``unique_codes``; each entry is the number of
        rows of ``data`` carrying that code, or 0 when the code is absent.
    """
    occurrences = data[code_column_name].value_counts()
    # Start from an all-zero vector so codes absent from ``data`` stay at 0.
    vector = pd.Series([0 for _ in unique_codes], index=unique_codes)
    vector[occurrences.index] = occurrences
    return vector

In [None]:
get_codes_vector(transactions_data.iloc[:1000], unique_codes)

Unnamed: 0,0
1030,126
7010,18
2330,13
1010,516
2010,111
...,...
4200,0
4035,0
7025,0
4045,0


In [None]:
# Build one row per customer holding the count of every transaction type.
# Only the code column is handed to ``apply``: running it on the whole frame
# also passes the grouping column, which pandas >= 2.2 deprecates with a
# warning. get_codes_vector reads nothing but 'tr_type', so the result is
# the same table.
customer_transactions = (
    transactions_data
    .groupby(['customer_id'])[['tr_type']]
    .apply(lambda group: get_codes_vector(group, unique_codes))
)

  customer_transactions = transactions_data.groupby(['customer_id']).apply(lambda x:


In [None]:
customer_transactions.shape

(8400, 76)

#### Для каждого клиента посчитали количество транзакций каждого типа

In [None]:
customer_transactions.head()

Unnamed: 0_level_0,1030,7010,2330,1010,2010,7020,1110,6110,7014,1100,...,8100,4061,2341,8146,2200,4200,4035,7025,4045,2000
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22899,47,40,2,6,48,0,27,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28753,25,5,29,89,24,0,65,1,0,3,...,0,0,0,0,0,0,0,0,0,0
42096,94,2,22,299,184,0,83,0,0,50,...,0,0,0,0,0,0,0,0,0,0
49793,81,12,0,287,70,0,42,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50940,0,0,0,9,90,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Подготовка данных по гендеру

In [None]:
genders = pd.read_csv('gender_train.csv')

#### Посмотрим, сколько клиентов принадлежит каждому из двух классов

In [None]:
genders['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
0,3276
1,2604


In [None]:
genders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Unnamed: 0   5880 non-null   int64
 1   customer_id  5880 non-null   int64
 2   gender       5880 non-null   int64
dtypes: int64(3)
memory usage: 137.9 KB


In [None]:
# Index the labels by customer and discard the unnamed first CSV column,
# leaving ``gender`` as the only remaining column.
genders.set_index('customer_id', inplace=True)
genders.drop(columns='Unnamed: 0', inplace=True)

In [None]:
genders.head()

Unnamed: 0_level_0,gender
customer_id,Unnamed: 1_level_1
57807356,1
31453935,0
48079721,0
39809041,0
99267661,1


In [None]:
genders.shape

(5880, 1)

### Подготовка данных для обучения

#### Теперь соединим таблицы, полученные на первом и втором шаге

In [None]:
# Left join on the index of both tables: every labelled customer in
# ``genders`` is kept and receives its transaction-type counts from
# ``customer_transactions``.
training_data = genders.merge(customer_transactions, how='left', left_index=True, right_index=True)

In [None]:
training_data.head()

Unnamed: 0_level_0,gender,1030,7010,2330,1010,2010,7020,1110,6110,7014,...,8100,4061,2341,8146,2200,4200,4035,7025,4045,2000
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57807356,1,195,82,5,10,52,0,15,0,0,...,0,0,0,0,0,0,0,0,0,0
31453935,0,111,2,4,292,91,0,131,0,0,...,0,0,0,0,0,0,0,0,0,0
48079721,0,70,6,1,85,21,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
39809041,0,56,16,23,329,34,0,47,0,0,...,0,0,0,0,0,0,0,0,0,0
99267661,1,53,35,0,17,60,0,51,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Разделим выборку на обучающую и тестовую

Здесь наш таргет - gender, его мы и будем предсказывать

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the labelled customers for validation (fixed seed, so the
# split is reproducible). The target is passed as a 1-D Series rather than a
# one-column DataFrame so that fitting does not emit sklearn's warning about
# a column-vector y being converted with column_or_1d.
training_points, test_points, training_values, test_values = train_test_split(
    training_data.drop(columns='gender'),
    training_data['gender'],
    test_size=0.4,
    random_state=0,
)

### Построим модель

Как пример возьмем Light Gradient Boosting Machine (lightgbm)

In [None]:
# build the lightgbm model
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Fit a LightGBM classifier, constructed without any explicit
# hyperparameters, on the training part of the split.
lgbm_classifier = lgb.LGBMClassifier()
lgbm_classifier.fit(training_points, training_values)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 1581, number of negative: 1947
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2483
[LightGBM] [Info] Number of data points in the train set: 3528, number of used features: 53
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448129 -> initscore=-0.208232
[LightGBM] [Info] Start training from score -0.208232


In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC on the held-out split, scored on the second predict_proba column
# (presumably the probability of gender == 1 — verify via lgbm_classifier.classes_).
print(roc_auc_score(test_values, lgbm_classifier.predict_proba(test_points)[:, 1]))

0.5996298821610115


### Подготовим submission

In [None]:
test_data = pd.read_csv('gender_test_kaggle_sample_submission.csv')

In [None]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   customer_id  2520 non-null   int64  
 1   probability  2520 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 39.5 KB


In [None]:
# Move customer_id from a regular column into the index, leaving
# ``probability`` as the only column.
test_data.set_index('customer_id', inplace=True)

In [None]:
customer_transactions.loc[test_data.index]

Unnamed: 0_level_0,1030,7010,2330,1010,2010,7020,1110,6110,7014,1100,...,8100,4061,2341,8146,2200,4200,4035,7025,4045,2000
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62533123,78,0,0,17,40,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23345904,37,29,0,21,141,0,66,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61855287,83,16,4,6,118,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26971094,10,1,1,55,81,0,99,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54669253,2,1,0,13,46,0,14,0,0,8,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43214184,19,11,4,18,27,0,6,0,0,1,...,0,0,0,0,0,0,0,0,0,0
53205452,1,1,0,2,40,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17515894,12,0,0,4,91,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10702887,30,4,0,57,22,0,25,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Score the submission customers: look up their transaction-count rows by the
# index of test_data and store the second predict_proba column as the
# prediction. NOTE(review): .loc raises KeyError if a listed customer has no
# row in customer_transactions.
test_data['probability'] = lgbm_classifier.predict_proba(customer_transactions.loc[test_data.index])[:, 1]

In [None]:
test_data.head()

Unnamed: 0_level_0,probability
customer_id,Unnamed: 1_level_1
62533123,0.463447
23345904,0.516496
61855287,0.281282
26971094,0.893058
54669253,0.527847


In [None]:
# Write the predictions to CSV; the index is written as the first column
# alongside ``probability``.
test_data.to_csv('our_prediction.csv')

In [None]:
from google.colab import files

In [None]:
files.download('our_prediction.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### А теперь попробуйте улучшить скор :)