In [1]:
from pathlib import Path
import pandas as pd
from datetime import datetime
import csv

import matplotlib
import matplotlib.pyplot as plt

import random
random.seed(42)

%matplotlib inline

In [2]:
# !pip install chardet
import chardet

**Plan**
1. NaNs +
2. Customer outliers
3. *Column* in terms frequency and amount - outliers
4. How many goods of certain groups (of goods)
5. Descriptives (discount, good preferences, preferences of group of good, etc.)

**0. Read data**

In [2]:
root_dir = Path('data/STATS/stats_lab1_data')
!ls {root_dir}

 product_groups.csv		 products_20201101.txt
 product_groups.docx		 product_sum_20201001.txt
'products_20200901 (copy).csv'	 tmp.csv
 products_20200901.csv		 tmp.txt
 products_20200901.txt


In [3]:
product_groups, september, october, november = root_dir / 'product_groups.docx', \
                                                root_dir / 'products_20200901.txt', \
                                                root_dir / 'product_sum_20201001.txt', \
                                                root_dir / 'products_20201101.txt'
september_csv = root_dir / 'products_20200901.csv'

In [5]:
!head {september} -n 3

date;id_doc;id_order;id_card;id_tov;id_kontr;quantity;sum;is_green
2020-09-01 17:01:25;0E05D308-5CEC-EA11-B444-005056A7539A;0;8594499   ;52;271;1,000;108;0
2020-09-01 17:35:36;610205A1-61EC-EA11-B444-005056A7539A;0;1452388   ;52;271;1,000;108;0


In [6]:
dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
columns = [s for s in 'date;id_doc;id_order;id_card;id_tov;id_kontr;quantity;sum;is_green'.split(';')]
print(columns)

['date', 'id_doc', 'id_order', 'id_card', 'id_tov', 'id_kontr', 'quantity', 'sum', 'is_green']


In [7]:
# ## !! Run this cell only once 
# # to convert txt to csv (it seems to speed up further file reading)

# with open(september, 'r') as in_file:
#     stripped = (line.strip() for line in in_file)
#     lines = (line.split(";") for line in stripped if line)
#     with open(september_csv, 'w') as out_file:
#         writer = csv.writer(out_file)
#         writer.writerow(())
#         writer.writerows(lines)

In [8]:
# whole data for september
whole_data = pd.read_csv(
    filepath_or_buffer=Path(september_csv),
    header=0
)

whole_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42100858 entries, 0 to 42100857
Data columns (total 9 columns):
 #   Column    Dtype  
---  ------    -----  
 0   date      object 
 1   id_doc    object 
 2   id_order  int64  
 3   id_card   object 
 4   id_tov    int64  
 5   id_kontr  float64
 6   quantity  object 
 7   sum       object 
 8   is_green  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 2.8+ GB


In [9]:
df = whole_data.copy()
# df = df.merge(product_groups_df, on='id_tov')

In [10]:
df.dtypes

date         object
id_doc       object
id_order      int64
id_card      object
id_tov        int64
id_kontr    float64
quantity     object
sum          object
is_green      int64
dtype: object

In [11]:
len(df)

42100858

**1. Handle null values** 

In [12]:
df.isnull().values.any()

True

In [13]:
col_with_nulls = df.loc[:, df.isnull().any()].columns
col_with_nulls

Index(['id_kontr'], dtype='object')

In [14]:
df[df['id_kontr'].isnull()].head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
73,2020-09-01 18:23:24,DCD681FA-67EC-EA11-B444-005056A7539A,0,1452280,185,,1000,129,0
126,2020-09-01 18:45:27,1606023F-6BEC-EA11-B444-005056A7539A,0,4475695,372,,1000,82,0
127,2020-09-01 20:55:22,AB82A13B-7DEC-EA11-B444-005056A7539A,0,5167781,372,,1000,82,0
128,2020-09-01 20:45:49,AE21D94F-7BEC-EA11-B444-005056A7539A,0,9813162,372,,1000,82,0
199,2020-09-01 14:53:45,03E52BAA-4AEC-EA11-B444-005056A7539A,0,7632871,647,,1000,5,0


In [15]:
# for now treat any null values as potential indicators of outliers: 
# fill them with unrealistic values (other approach could be replacing nulls with the most frequent value)

null_fill_value = -9999
col = 'id_kontr'
df[col][df[col].isnull()] = null_fill_value
   
df[df['id_kontr'] == null_fill_value].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col].isnull()] = null_fill_value


Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
73,2020-09-01 18:23:24,DCD681FA-67EC-EA11-B444-005056A7539A,0,1452280,185,-9999.0,1000,129,0
126,2020-09-01 18:45:27,1606023F-6BEC-EA11-B444-005056A7539A,0,4475695,372,-9999.0,1000,82,0
127,2020-09-01 20:55:22,AB82A13B-7DEC-EA11-B444-005056A7539A,0,5167781,372,-9999.0,1000,82,0
128,2020-09-01 20:45:49,AE21D94F-7BEC-EA11-B444-005056A7539A,0,9813162,372,-9999.0,1000,82,0
199,2020-09-01 14:53:45,03E52BAA-4AEC-EA11-B444-005056A7539A,0,7632871,647,-9999.0,1000,5,0


**2. Identify customer outliers**
 - Bought too much (quantity)
 - Bought too often (same id_card several times a day etc.)
     - (Todo) Treat bills with alcohol (if timestep is the same then it is the same transaction, not a next one)

In [16]:
df['id_card'].nunique(), df['id_doc'].nunique()

(1358089, 7709739)

In [17]:
df

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271.0,1000,108,0
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271.0,1000,108,0
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271.0,1000,10795,0
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271.0,1000,108,0
4,2020-09-01 18:42:31,0706023F-6BEC-EA11-B444-005056A7539A,0,5732396,61,97.0,2000,88,0
...,...,...,...,...,...,...,...,...,...
42100853,2020-09-30 11:26:28,350DE9C7-F602-EB11-B444-005056A7539A,0,A465340,25292,17528.0,1000,130,0
42100854,2020-09-30 08:12:39,6D7ACBFB-DB02-EB11-B444-005056A7539A,0,A698682,30740,19188.0,2000,120,0
42100855,2020-09-30 09:54:24,F96D45E8-E902-EB11-B444-005056A7539A,0,7790662,30741,19188.0,1000,73,0
42100856,2020-09-30 12:00:04,12067064-FB02-EB11-B444-005056A7539A,0,2751739,34835,15916.0,1000,78,0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42100858 entries, 0 to 42100857
Data columns (total 9 columns):
 #   Column    Dtype  
---  ------    -----  
 0   date      object 
 1   id_doc    object 
 2   id_order  int64  
 3   id_card   object 
 4   id_tov    int64  
 5   id_kontr  float64
 6   quantity  object 
 7   sum       object 
 8   is_green  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 2.8+ GB


In [19]:
##df['id_card'] = df['id_card'].astype('int')  #not possible: letters present
df['id_kontr'] = df['id_kontr'].astype('int')
df['quantity'] = df['quantity'].str.replace(',', '.').astype('float')
df['sum'] = df['sum'].str.replace(',', '.').astype('float')
df['date'] = pd.to_datetime(df['date'])

In [20]:
df.describe()

Unnamed: 0,id_order,id_tov,id_kontr,quantity,sum,is_green
count,42100860.0,42100860.0,42100860.0,42100860.0,42100860.0,42100860.0
mean,776586.9,17630.64,10459.77,1.427065,114.9112,0.07614348
std,2032896.0,11032.26,8794.764,31.01411,107.2201,0.2652276
min,0.0,46.0,-9999.0,-2.755,-1022.11,0.0
25%,0.0,14526.0,385.0,1.0,48.0,0.0
50%,0.0,19435.0,13581.0,1.0,89.0,0.0
75%,0.0,24558.0,16304.0,1.0,150.11,0.0
max,6673056.0,38660.0,21943.0,5000.0,38400.0,1.0


In [21]:
neg_quantity = df[df['quantity'] < 0]
neg_quantity

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
4206093,2020-09-04 09:10:21,827B9591-75EE-EA11-B444-005056A7539A,0,7009929,23074,19269,-1.268,470.43,0
4206103,2020-09-04 09:09:24,F6807973-75EE-EA11-B444-005056A7539A,0,7707981,23074,19269,-1.268,470.43,0
4210123,2020-09-04 08:37:00,49C21AF5-70EE-EA11-B444-005056A7539A,0,5554550,23074,19269,-1.084,402.16,0
4218020,2020-09-04 08:43:37,B681FEE3-71EE-EA11-B444-005056A7539A,0,0643258,23074,19269,-0.001,0.37,0
4218022,2020-09-04 08:43:37,B681FEE3-71EE-EA11-B444-005056A7539A,0,0643258,23074,19269,-2.375,881.13,0
...,...,...,...,...,...,...,...,...,...
5382631,2020-09-04 09:13:44,8B5F8150-76EE-EA11-B444-005056A7539A,0,8593402,23074,19269,-0.692,256.73,0
5396781,2020-09-04 08:37:49,1CC214FB-70EE-EA11-B444-005056A7539A,0,4916790,23074,19269,-1.236,458.56,0
5438902,2020-09-04 09:28:21,E6F17E03-78EE-EA11-B444-005056A7539A,0,5931979,23074,19269,-0.572,212.21,0
5450158,2020-09-04 09:21:17,CD1BCB2C-77EE-EA11-B444-005056A7539A,0,3290674,23074,19269,-0.915,339.47,0


In [22]:
neg_sum = df[df['sum'] < 0]
neg_sum

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
4206092,2020-09-04 09:10:21,827B9591-75EE-EA11-B444-005056A7539A,0,7009929,23074,19269,1.268,-470.43,0
4206102,2020-09-04 09:09:24,F6807973-75EE-EA11-B444-005056A7539A,0,7707981,23074,19269,1.268,-470.43,0
4210124,2020-09-04 08:37:00,49C21AF5-70EE-EA11-B444-005056A7539A,0,5554550,23074,19269,1.084,-402.16,0
4218019,2020-09-04 08:43:37,B681FEE3-71EE-EA11-B444-005056A7539A,0,0643258,23074,19269,0.001,-0.37,0
4218021,2020-09-04 08:43:37,B681FEE3-71EE-EA11-B444-005056A7539A,0,0643258,23074,19269,2.375,-881.13,0
...,...,...,...,...,...,...,...,...,...
5396780,2020-09-04 08:37:49,1CC214FB-70EE-EA11-B444-005056A7539A,0,4916790,23074,19269,1.236,-458.56,0
5438901,2020-09-04 09:28:21,E6F17E03-78EE-EA11-B444-005056A7539A,0,5931979,23074,19269,0.572,-212.21,0
5450157,2020-09-04 09:21:17,CD1BCB2C-77EE-EA11-B444-005056A7539A,0,3290674,23074,19269,0.915,-339.47,0
5450159,2020-09-04 09:21:17,CD1BCB2C-77EE-EA11-B444-005056A7539A,0,3290674,23074,19269,0.915,-339.47,0


Negative values look strange... probably these are the outliers which are transactions between stores, or just noise. As there are quite few of them, let's just remove these samples.

In [24]:
df.drop(df[df['quantity'] < 0].index, inplace=True)
df.drop(df[df['sum'] < 0].index, inplace=True)

In [25]:
df

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271,1.0,108.00,0
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271,1.0,108.00,0
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271,1.0,107.95,0
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271,1.0,108.00,0
4,2020-09-01 18:42:31,0706023F-6BEC-EA11-B444-005056A7539A,0,5732396,61,97,2.0,88.00,0
...,...,...,...,...,...,...,...,...,...
42100853,2020-09-30 11:26:28,350DE9C7-F602-EB11-B444-005056A7539A,0,A465340,25292,17528,1.0,130.00,0
42100854,2020-09-30 08:12:39,6D7ACBFB-DB02-EB11-B444-005056A7539A,0,A698682,30740,19188,2.0,120.00,0
42100855,2020-09-30 09:54:24,F96D45E8-E902-EB11-B444-005056A7539A,0,7790662,30741,19188,1.0,73.00,0
42100856,2020-09-30 12:00:04,12067064-FB02-EB11-B444-005056A7539A,0,2751739,34835,15916,1.0,78.00,0


Now let's look at the quantity of goods a user buys on average. 

We can eliminate the samples where the quantity is too large (transactions between shops). And the reasoning why we look at the mean values for a user (not simply eliminate large values) is that the transactions between shops may also have small quantity but we shouldn't take them into account when considering a user preferences etc. Thus, large mean value for quantity should indicate it is a shop transaction id card.

In [26]:
quantity_user = df.groupby(['id_card'], as_index=False)['quantity'].mean()
quantity_user = quantity_user.rename(columns={'quantity': 'user_mean_quantity'})
df = df.merge(quantity_user, on='id_card')
quantity_user

Unnamed: 0,id_card,user_mean_quantity
0,0000000,1.142500
1,0000001,1.000000
2,0000003,1.333333
3,0000009,1.148765
4,0000015,1.081333
...,...,...
1358084,c539320,1.141143
1358085,c551708,1.523333
1358086,c633515,1.110969
1358087,c679536,1.031702


In [28]:
quantity_user.describe()

Unnamed: 0,user_mean_quantity
count,1358089.0
mean,1.403017
std,11.89936
min,0.034
25%,1.0
50%,1.0535
75%,1.25
max,2500.5


In [30]:
max_quantity_quantile = 0.997
shop_transactions = df[df['user_mean_quantity'] > df['user_mean_quantity'].quantile(max_quantity_quantile)]
quantity_threshold = shop_transactions['user_mean_quantity'].min()
quantity_threshold

21.074119999999997

I personally rarely buy more than 21 items... I guess it may also be considered as outliers (transactions with large quantity).

There are quite a lot of such samples though, so let's remove them from here but store them in a separate dataframe *shop_transactions* (in case it is useful later)

In [31]:
shop_transactions

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,user_mean_quantity
3237,2020-09-01 20:46:41,245BD476-7BEC-EA11-B444-005056A7539A,0,3004195,158,15385,1.000,57.00,0,32.443039
3238,2020-09-01 20:46:41,245BD476-7BEC-EA11-B444-005056A7539A,0,3004195,619,16860,0.478,34.42,0,32.443039
3239,2020-09-01 20:46:41,245BD476-7BEC-EA11-B444-005056A7539A,0,3004195,18735,16860,1.036,67.34,0,32.443039
3240,2020-09-01 20:46:41,245BD476-7BEC-EA11-B444-005056A7539A,0,3004195,18796,13887,1.000,65.00,0,32.443039
3241,2020-09-01 20:46:41,245BD476-7BEC-EA11-B444-005056A7539A,0,3004195,21005,19687,0.158,37.13,0,32.443039
...,...,...,...,...,...,...,...,...,...,...
42091310,2020-09-30 09:29:06,0B649B87-E602-EB11-B444-005056A7539A,0,6738849,731,17268,0.905,79.64,0,286.435000
42091311,2020-09-30 09:29:06,0B649B87-E602-EB11-B444-005056A7539A,0,6738849,1113,-9999,2000.000,0.00,0,286.435000
42091312,2020-09-30 09:29:06,0B649B87-E602-EB11-B444-005056A7539A,0,6738849,15738,17616,0.760,149.72,0,286.435000
42091313,2020-09-30 09:29:06,0B649B87-E602-EB11-B444-005056A7539A,0,6738849,15917,16612,0.680,87.04,0,286.435000


In [32]:
df.drop(df[df['user_mean_quantity'] > quantity_threshold].index, inplace=True)

In [40]:
quantity_bill = df.groupby(['id_doc'], as_index=False)['quantity'].mean()
quantity_bill = quantity_bill.rename(columns={'quantity': 'doc_mean_quantity'})
df = df.merge(quantity_bill, on='id_doc')
quantity_bill

Unnamed: 0,id_doc,doc_mean_quantity
0,00000230-16FC-EA11-B444-005056A7539A,1.500000
1,00000292-91FE-EA11-B444-005056A7539A,1.117647
2,00000450-F1FC-EA11-B444-005056A7539A,1.000000
3,00000A30-E700-EB11-B444-005056A7539A,1.000000
4,00000C88-A3F6-EA11-B444-005056A7539A,1.033857
...,...,...
7679161,FFFFFA98-0AFC-EA11-B444-005056A7539A,1.000000
7679162,FFFFFBDD-1FF5-EA11-B444-005056A7539A,1.000000
7679163,FFFFFE9E-DAEC-EA11-B444-005056A7539A,1.800000
7679164,FFFFFEBE-FE09-41C5-A140-507AC54850D7,1.673222


In [41]:
quantity_bill.describe()

Unnamed: 0,doc_mean_quantity
count,7679166.0
mean,1.231915
std,4.566324
min,0.0
25%,1.0
50%,1.0
75%,1.25
max,2500.5


In [47]:
max_doc_quantity_quantile = 0.9998
shop_transactions = shop_transactions.append(df[df['doc_mean_quantity'] > df['doc_mean_quantity'].quantile(max_doc_quantity_quantile)])

doc_quantity_threshold = shop_transactions['doc_mean_quantity'].min()
doc_quantity_threshold

18.0

In [49]:
df.drop(df[df['doc_mean_quantity'] > doc_quantity_threshold].index, inplace=True)

In [50]:
df

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,user_mean_quantity,doc_mean_quantity
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271,1.000,108.00,0,0.984667,0.931
1,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,647,-9999,1.000,5.00,0,0.984667,0.931
2,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,15044,17441,2.650,71.55,0,0.984667,0.931
3,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,18257,15829,0.406,44.66,0,0.984667,0.931
4,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,22920,16205,1.000,318.00,0,0.984667,0.931
...,...,...,...,...,...,...,...,...,...,...,...
41974668,2020-09-30 09:37:16,F3CD288E-E702-EB11-B444-005056A7539A,0,3189952,34835,15916,1.000,78.00,0,0.780000,0.780
41974669,2020-09-30 16:09:58,C150BA5C-1E03-EB11-B444-005056A7539A,0,C526660,16319,102,1.000,52.00,0,1.000000,1.000
41974670,2020-09-30 12:48:53,43CF9336-0203-EB11-B444-005056A7539A,0,7790662,22639,12970,1.000,188.00,0,1.000000,1.000
41974671,2020-09-30 09:54:24,F96D45E8-E902-EB11-B444-005056A7539A,0,7790662,30741,19188,1.000,73.00,0,1.000000,1.000


In [51]:
df.describe()

Unnamed: 0,id_order,id_tov,id_kontr,quantity,sum,is_green,user_mean_quantity,doc_mean_quantity
count,41966380.0,41966380.0,41966380.0,41966380.0,41966380.0,41966380.0,41966380.0,41966380.0
mean,777832.5,17631.16,10459.65,1.166921,114.9443,0.0757824,1.191966,1.166921
std,2034279.0,11031.38,8793.696,0.8485783,107.0363,0.2646496,0.5942443,0.435026
min,0.0,46.0,-9999.0,0.0,0.0,0.0,0.034,0.0
25%,0.0,14526.0,385.0,1.0,48.0,0.0,1.0052,0.9872
50%,0.0,19435.0,13581.0,1.0,89.0,0.0,1.112942,1.04896
75%,0.0,24558.0,16304.0,1.0,150.37,0.0,1.259847,1.272727
max,6673056.0,38660.0,21943.0,500.0,38400.0,1.0,21.07412,18.0


In [52]:
df[df.quantity > quantity_threshold]

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,user_mean_quantity,doc_mean_quantity
15990,2020-09-08 21:42:44,ABD8F5CD-04F2-EA11-B444-005056A7539A,0,0869878,1113,-9999,100.0,0.0,0,1.842923,13.375000
229459,2020-09-18 16:43:22,C06A8E37-B5F9-EA11-B444-005056A7539A,0,6317441,23209,16887,25.0,900.0,0,1.535202,2.458129
238726,2020-09-01 21:22:46,73D4665A-80EC-EA11-B444-005056A7539A,0,5188648,23209,16887,29.0,1044.0,0,1.986212,15.000000
249467,2020-09-26 14:31:45,DE5ECA39-ECFF-EA11-B444-005056A7539A,0,5887055,25150,17639,25.0,550.0,0,1.765458,13.000000
257575,2020-09-21 18:53:54,2EDB56E6-23FC-EA11-B444-005056A7539A,0,0509972,20497,14557,36.0,720.0,0,1.524260,13.666667
...,...,...,...,...,...,...,...,...,...,...,...
41829400,2020-09-29 23:02:00,57530DBE-8E02-EB11-B444-005056A7539A,0,4079924,16842,12853,24.0,456.0,0,6.200000,6.200000
41920430,2020-09-30 08:42:42,43EADBF6-DF02-EB11-B444-005056A7539A,6618085,4211919,12210,122,25.0,2150.0,0,13.250000,13.250000
41920431,2020-09-30 08:42:42,43EADBF6-DF02-EB11-B444-005056A7539A,6618085,4211919,20494,14557,25.0,575.0,0,13.250000,13.250000
41935434,2020-09-30 15:32:35,A53806AF-1903-EB11-B444-005056A7539A,0,0206107,14472,14211,27.0,1539.0,0,11.800000,11.800000


In [53]:
shop_transactions = shop_transactions.append(df[df['quantity'] > quantity_threshold)
df.drop(df[df['quantity'] > quantity_threshold].index, inplace=True)

In [54]:
df.describe()

Unnamed: 0,id_order,id_tov,id_kontr,quantity,sum,is_green,user_mean_quantity,doc_mean_quantity
count,41964350.0,41964350.0,41964350.0,41964350.0,41964350.0,41964350.0,41964350.0,41964350.0
mean,777780.6,17631.14,10459.8,1.165324,114.9117,0.07578605,1.191873,1.166571
std,2034220.0,11031.49,8793.551,0.7899503,106.8491,0.2646555,0.5937848,0.4310071
min,0.0,46.0,-9999.0,0.0,0.0,0.0,0.034,0.0
25%,0.0,14526.0,385.0,1.0,48.0,0.0,1.0052,0.9871818
50%,0.0,19435.0,13581.0,1.0,89.0,0.0,1.112931,1.048923
75%,0.0,24558.0,16304.0,1.0,150.33,0.0,1.259796,1.272727
max,6673056.0,38660.0,21943.0,21.0,38400.0,1.0,21.07412,18.0


In [55]:
# df.to_csv('df_no_outliers.csv')