In [2]:
#import library
import time
import numpy as np
import pandas as pd
import os

**1. Deal with large datasets and prepare a batch for training**

In [3]:
# To display all columns/rows

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth',None)

**Dealing with loading large files:-**

**- When dealing with large files the machine does not have enough memory to read the entire CSV into a DataFrame at one time. Assuming we do not need the entire dataset in memory all at one time, one way to avoid the problem would be to process the CSV in chunks (number of rows processed) of suitable size. Then use concat to get all the chunks.**

**- This function returns an iterator to iterate through these chunks and then wishfully processes them. Since only a part of a large file is read at once, low memory is enough to fit the data. Later, these chunks can be concatenated in a single dataframe.**

**- Here our data in such task contain date time column and we found pandas read it as object as defult where it is a datetime dtype and we found we reduct the size of data frame by 5 MB in addition we optimizing numeric columns with subtypes and optimizing object types using categoricalsas as we demostratethat as following**

In [4]:
#read with assign parse_dates prameter
start = time.time()
#read csv file 1
CustomersBuyPart1_withparse_dates=pd.read_csv('CustomersBuyPart1.csv',parse_dates=['retailweek'],infer_datetime_format=True)
#read csv file 2
CustomersBuyPart2_withparse_dates=pd.read_csv('CustomersBuyPart2.csv',parse_dates=['retailweek'],infer_datetime_format=True)
end = time.time()

print("Read csvs without chunks: ",(end-start),"sec")

Read csvs without chunks:  0.4232790470123291 sec


In [1]:
# #read with out assign parse_dates prameter
# start = time.time()
# CustomersBuyPart1_withOutparse_dates=pd.read_csv('CustomersBuyPart1.csv')
# CustomersBuyPart2_withOutparse_dates=pd.read_csv('CustomersBuyPart2.csv')
# end = time.time()
# print("Read csvs without chunks: ",(end-start),"sec")

In [6]:
#function compute the size in MB to df
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [7]:
mem_usage(CustomersBuyPart1_withparse_dates),mem_usage(CustomersBuyPart1_withOutparse_dates),

('28.99 MB', '31.41 MB')

In [8]:
mem_usage(CustomersBuyPart2_withparse_dates),mem_usage(CustomersBuyPart2_withOutparse_dates)

('38.42 MB', '41.63 MB')

In [9]:
all_df_withparse_dates=pd.concat([CustomersBuyPart1_withparse_dates,CustomersBuyPart2_withparse_dates],axis=0)
all_df_withoutparse_dates=pd.concat([CustomersBuyPart1_withOutparse_dates,CustomersBuyPart2_withOutparse_dates],axis=0)

In [10]:
mem_usage(all_df_withparse_dates),mem_usage(all_df_withoutparse_dates) #reduct 5 mega by using parse_dates prameter

('68.17 MB', '73.80 MB')

In [11]:
all_df_withoutparse_dates.info()  # retailweek feature is object type

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100001 entries, 0 to 56999
Data columns (total 24 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   country         100001 non-null  object 
 1   article         100001 non-null  object 
 2   sales           100001 non-null  int64  
 3   regular_price   100001 non-null  float64
 4   current_price   100001 non-null  float64
 5   ratio           100001 non-null  float64
 6   retailweek      100001 non-null  object 
 7   promo1          100001 non-null  int64  
 8   promo2          100001 non-null  int64  
 9   customer_id     100001 non-null  float64
 10  article.1       100001 non-null  object 
 11  productgroup    100001 non-null  object 
 12  category        100001 non-null  object 
 13  cost            100001 non-null  float64
 14  style           100001 non-null  object 
 15  sizes           100001 non-null  object 
 16  gender          100001 non-null  object 
 17  rgb_r_main_

In [12]:
all_df_withparse_dates.info()  ## retailweek feature  become datetime64  as a type

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100001 entries, 0 to 56999
Data columns (total 24 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   country         100001 non-null  object        
 1   article         100001 non-null  object        
 2   sales           100001 non-null  int64         
 3   regular_price   100001 non-null  float64       
 4   current_price   100001 non-null  float64       
 5   ratio           100001 non-null  float64       
 6   retailweek      100001 non-null  datetime64[ns]
 7   promo1          100001 non-null  int64         
 8   promo2          100001 non-null  int64         
 9   customer_id     100001 non-null  float64       
 10  article.1       100001 non-null  object        
 11  productgroup    100001 non-null  object        
 12  category        100001 non-null  object        
 13  cost            100001 non-null  float64       
 14  style           100001 non-null  obje

In [13]:
mylist=range(1,int(len(all_df_withparse_dates)/100),1)
start = time.time()
#read data in chunks of 100 rows at a time
chunk1 = pd.read_csv('CustomersBuyPart1.csv',chunksize=100,parse_dates=['retailweek'],infer_datetime_format=True)
chunk2 = pd.read_csv('CustomersBuyPart2.csv',chunksize=100,parse_dates=['retailweek'],infer_datetime_format=True)
end = time.time()
print("Read csv with chunks: ",(end-start),"sec")
CustomersBuyPart1_df = pd.concat(chunk1,keys=mylist)
CustomersBuyPart2_df = pd.concat(chunk2,keys=mylist)

Read csv with chunks:  0.01701068878173828 sec


In [20]:
all_df=pd.concat([CustomersBuyPart1_df,CustomersBuyPart2_df],axis=0)

In [21]:
all_df.head(10)

Unnamed: 0,Unnamed: 1,country,article,sales,regular_price,current_price,ratio,retailweek,promo1,promo2,customer_id,article.1,productgroup,category,cost,style,sizes,gender,rgb_r_main_col,rgb_g_main_col,rgb_b_main_col,rgb_r_sec_col,rgb_g_sec_col,rgb_b_sec_col,label
1,0,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,OC6355,SHOES,TRAINING,13.29,slim,"xxs,xs,s,m,l,xl,xxl",women,205,104,57,255,187,255,DidNotBuy
1,1,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,AP5568,SHORTS,TRAINING,2.29,regular,"xxs,xs,s,m,l,xl,xxl",women,188,238,104,255,187,255,DidNotBuy
1,2,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,CB8861,HARDWARE ACCESSORIES,GOLF,1.7,regular,"xxs,xs,s,m,l,xl,xxl",women,205,173,0,255,187,255,DidNotBuy
1,3,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,LI3529,SHOES,RUNNING,9.0,regular,"xxs,xs,s,m,l,xl,xxl",kids,205,140,149,164,211,238,DidNotBuy
1,4,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,GG8661,SHOES,RELAX CASUAL,9.6,regular,"xxs,xs,s,m,l,xl,xxl",women,138,43,226,164,211,238,DidNotBuy
1,5,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,TX1463,SWEATSHIRTS,TRAINING,4.2,wide,"xxs,xs,s,m,l,xl,xxl",women,79,148,205,164,211,238,DidBuy
1,6,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,PC6383,SHOES,FOOTBALL GENERIC,9.9,wide,"xs,s,m,l,xl",unisex,139,26,26,205,155,155,DidNotBuy
1,7,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,VT7698,SHOES,INDOOR,5.2,wide,"xxs,xs,s,m,l,xl,xxl",women,135,206,250,205,155,155,DidBuy
1,8,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,FG2965,HARDWARE ACCESSORIES,RUNNING,1.29,slim,"xxs,xs,s,m,l,xl,xxl",women,181,181,181,205,155,155,DidNotBuy
1,9,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,AC7347,SHOES,FOOTBALL GENERIC,8.7,regular,"xxs,xs,s,m,l,xl,xxl",men,139,137,137,205,155,155,DidBuy


In [23]:
all_df.index[:200]  #show the indexs as batch number for each 100 row

MultiIndex([(1,   0),
            (1,   1),
            (1,   2),
            (1,   3),
            (1,   4),
            (1,   5),
            (1,   6),
            (1,   7),
            (1,   8),
            (1,   9),
            ...
            (2, 190),
            (2, 191),
            (2, 192),
            (2, 193),
            (2, 194),
            (2, 195),
            (2, 196),
            (2, 197),
            (2, 198),
            (2, 199)],
           length=200)

In [24]:
mem_usage(all_df)

'70.94 MB'

In [25]:
#replce categoural data manual and check the size of data frame--->   {this step just for compare the result futher}
all_df_manual=all_df.copy()
all_df_manual.replace({"xs,s,m,l,xl": 0, "xxs,xs,s,m,l,xl,xxl": 1}, inplace=True)
all_df_manual.replace({"DidNotBuy": 0, "DidBuy": 1}, inplace=True)
all_df_manual.replace({"Germany": 0, "Austria": 1,"France":2}, inplace=True)
all_df_manual.replace({"women": 0, "kids": 1,"unisex":2,"men":3}, inplace=True)
all_df_manual.replace({"slim": 0, "regular": 1,"wide":2}, inplace=True)
all_df_manual.replace({"RELAX CASUAL": 0, "GOLF": 1,"FOOTBALL GENERIC":2,"RUNNING":3,"TRAINING":4,"INDOOR":5}, inplace=True)
all_df_manual.replace({"HARDWARE ACCESSORIES": 0, "SHOES": 1,"SWEATSHIRTS":2,"SHORTS":3}, inplace=True)
all_df_manual.replace({"GG8661": 0, "PC6383": 1,"CB8861":2,"FG2965":3,"TX1463": 4, "OC6355": 5,"AC7347":6,"LI3529":7, "OC6355": 5,"AP5568":8,"VT7698":9}, inplace=True)

In [26]:
all_df_manual=all_df_manual.drop(['article','sizes'],axis=1)  #just 
all_df_manual.head(10)

Unnamed: 0,Unnamed: 1,country,sales,regular_price,current_price,ratio,retailweek,promo1,promo2,customer_id,article.1,productgroup,category,cost,style,gender,rgb_r_main_col,rgb_g_main_col,rgb_b_main_col,rgb_r_sec_col,rgb_g_sec_col,rgb_b_sec_col,label
1,0,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,5,1,4,13.29,0,0,205,104,57,255,187,255,0
1,1,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,8,3,4,2.29,1,0,188,238,104,255,187,255,0
1,2,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,2,0,1,1.7,1,0,205,173,0,255,187,255,0
1,3,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,7,1,3,9.0,1,1,205,140,149,164,211,238,0
1,4,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,0,1,0,9.6,1,0,138,43,226,164,211,238,0
1,5,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,4,2,4,4.2,2,0,79,148,205,164,211,238,1
1,6,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,1,1,2,9.9,2,2,139,26,26,205,155,155,0
1,7,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,9,1,5,5.2,2,0,135,206,250,205,155,155,1
1,8,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,3,0,3,1.29,0,0,181,181,181,205,155,155,0
1,9,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,6,1,2,8.7,1,3,139,137,137,205,155,155,1


In [28]:
mem_usage(all_df_manual)

'20.32 MB'

In [29]:
all_df_dumm = pd.get_dummies(all_df)   #get one hot encding for categural feature and chech the size

In [30]:
mem_usage(all_df_dumm)

'63.71 MB'

In [31]:
all_df_dumm.head()

Unnamed: 0,Unnamed: 1,sales,regular_price,current_price,ratio,retailweek,promo1,promo2,customer_id,cost,rgb_r_main_col,rgb_g_main_col,rgb_b_main_col,rgb_r_sec_col,rgb_g_sec_col,rgb_b_sec_col,country_Austria,country_France,country_Germany,article_AA1821,article_AA7884,article_AA8941,article_AC7347,article_AD9697,article_AF5746,article_AH6675,article_AJ7542,article_AL2298,article_AL9977,article_AM4669,article_AN4895,article_AO8265,article_AP5568,article_AQ1643,article_AR1923,article_AR4473,article_AT7497,article_AU7641,article_AX5913,article_AX5971,article_AZ5221,article_AZ6626,article_BC1489,article_BC6932,article_BE2333,article_BE9148,article_BF7459,article_BF7554,article_BF9848,article_BH9952,article_BI5591,article_BI5643,article_BJ4373,article_BM9116,article_BR3179,article_BS7795,article_BU9681,article_BW2758,article_BX8284,article_BX9481,article_BY9685,article_BZ4828,article_BZ8791,article_CA2199,article_CA2479,article_CB4942,article_CB8861,article_CC8861,article_CF3238,article_CF4856,article_CH6937,article_CJ4578,article_CK7156,article_CL8759,article_CO7738,article_CQ8153,article_CR8478,article_CX1431,article_CY6963,article_DB3258,article_DD1361,article_DG7643,article_DH6848,article_DI9187,article_DK3634,article_DM6271,article_DM6477,article_DW2429,article_DW8683,article_DY1673,article_DZ3492,article_EA9617,article_EB5477,article_EC5317,article_EF2771,article_EF6812,article_EH5694,article_EI1264,article_EL3283,article_EL6462,article_EM9513,article_EN1199,article_EN9438,article_ET7242,article_EU1121,article_EZ3428,article_EZ8648,article_FB5424,article_FE2938,article_FE4648,article_FE6641,article_FE6662,article_FF7283,article_FG2965,article_FJ2121,article_FJ8179,article_FK6357,article_FK7423,article_FO4538,article_FP2228,article_FP7124,article_FS5149,article_FU5676,article_FV6234,article_FX1729,article_FY5273,article_GA4832,article_GB6449,article_GC8114,article_GD2286,article_GG3324,article_GG8661,article_GJ5184,article_GL8661,article_GP3497,article_GP6821,article_GR1127,article_GR3986,article_GS4461,article_GT2628,article_GT5685,article_GW8244,article_GZ1752,article_GZ5576,article_HB1693,article_HD1628,article_HJ9196,article_HM5731,article_HM8568,article_HN6759,article_HN7272,article_HN7357,article_HQ3171,article_HQ9691,article_HU6228,article_HW7772,article_HZ4826,article_HZ9888,article_IA4131,article_IB8671,article_IF7337,article_IH1672,article_IL7684,article_IM2273,article_IO7646,article_IQ1913,article_IR3275,article_IW7978,article_IW8485,article_JA4544,article_JB4241,article_JC1565,article_JC5886,article_JG1582,article_JG6384,article_JI2453,article_JK5796,article_JM7648,article_JN4924,article_JP9274,article_JQ8333,article_JR7981,article_JR8311,article_JW4878,article_JX7462,article_JY1298,article_JY1726,article_KE3772,article_KF6572,article_KF7125,article_KF7243,article_KI2338,article_KI5716,article_KJ7255,article_KJ9185,article_KL1526,article_KO9295,article_KT2132,article_KT8964,article_KT9618,article_KV2454,article_KV6219,article_KY7934,article_KZ9384,article_LB9256,article_LC1964,article_LD1896,article_LD8468,article_LG5858,article_LH8921,article_LI3529,article_LI5748,article_LI6472,article_LL3852,article_LL7287,article_LR5226,article_LT4238,article_LU3394,article_LU6658,article_LX1494,article_LX5583,article_LX5774,article_LY8874,article_MA7179,article_MC3398,article_MD2664,article_MG2169,article_MI6988,article_MJ2618,article_MK5273,article_ML2223,article_MM4542,article_MO9371,article_MP6772,article_MQ6248,article_MR4948,article_MW3528,article_MW7971,article_MW9292,article_MZ9561,article_NB5887,article_NE7168,article_NH7643,article_NH9366,article_NJ3895,article_NK3982,article_NK4915,article_NL2136,article_NM4424,article_NQ1161,article_NS7357,article_NT3648,article_NW3584,article_NY5159,article_NY5947,article_NY6781,article_OA8258,article_OC6355,article_OE7548,article_OF8158,article_OI4367,article_OJ4847,article_OK8155,article_ON4163,article_ON6325,article_ON6494,article_ON9331,article_OO1497,article_OP1184,article_OT2311,article_OU2254,article_OU5334,article_OV5561,article_OW5968,article_OY4474,article_OZ8992,article_PB1483,article_PC6383,article_PE2872,article_PE5968,article_PF5685,article_PH9161,article_PL6969,article_PN1714,article_PP8845,article_PQ4964,article_PQ6379,article_PQ6773,article_PQ6953,article_PT2992,article_PU1185,article_PV1343,article_PV4787,article_PV7587,article_PW6278,article_PW7632,article_PY1419,article_PY1913,article_PY2718,article_PZ7731,article_QB1247,article_QB6977,article_QC7465,article_QD2412,article_QD9777,article_QG3131,article_QK7994,article_QL6154,article_QM3774,article_QO5375,article_QO7834,article_QO8312,article_QP2819,article_QS1816,article_QS5396,article_QT2338,article_QT7325,article_QU7755,article_QV8877,article_QX5316,article_RC5832,article_RE3197,article_RE8165,article_RE8863,article_RF2926,article_RF6397,article_RF6881,article_RH5979,article_RJ3725,article_RJ5552,article_RN4195,article_RN5619,article_RN7483,article_RO5412,article_RP9222,article_RS3662,article_RT6283,article_RV9228,article_RX1584,article_RX4112,article_SA2925,article_SC5839,article_SE2934,article_SF1988,article_SG5828,article_SG6172,article_SH7883,article_SJ4545,article_SL9748,article_SO4773,article_SP6977,article_ST3419,article_SV7732,article_SW2464,article_SW4387,article_SW7987,article_TA7629,article_TC9631,article_TJ1277,article_TK4862,article_TL9924,article_TM4166,article_TN5256,article_TN7113,article_TO2769,article_TO8135,article_TR1972,article_TS8227,article_TS8795,article_TS8911,article_TW8762,article_TX1463,article_TX3691,article_TX8432,article_TY9287,article_UB1117,article_UD3728,article_UG2991,article_UG4425,article_UJ4517,article_UM7314,article_UN9356,article_UR7332,article_UV9411,article_UX6816,article_UX6851,article_VA9789,article_VC4517,article_VD4566,article_VE4993,article_VF6733,article_VF7316,article_VG1586,article_VH1588,article_VJ8341,article_VK4838,article_VK5535,article_VL9749,article_VM7772,article_VP7827,article_VR2932,article_VS2118,article_VS6613,article_VT1698,article_VT3516,article_VT7698,article_VU8833,article_VW8489,article_VW9933,article_VX6536,article_VX8496,article_VY6942,article_VY8356,article_VY8476,article_WB3723,article_WB3769,article_WB8526,article_WC1828,article_WE4646,article_WF4276,article_WJ9718,article_WK5365,article_WL2581,article_WM7783,article_WO1329,article_WP4135,article_WP4574,article_WQ8254,article_WR9459,article_WT9578,article_WU2517,article_WV8337,article_WZ7972,article_XB1815,article_XB3134,article_XC9518,article_XF3362,article_XF4182,article_XF4642,article_XG3252,article_XG6147,article_XG6449,article_XH3727,article_XH6675,article_XI2814,article_XI2961,article_XI5411,article_XJ1725,article_XK6846,article_XK8557,article_XN6238,article_XO7333,article_XR5464,article_XS4279,article_XT5836,article_XU7827,article_XU9926,article_YD2684,article_YG9479,article_YI3589,article_YK5786,article_YL7926,article_YN2747,article_YN8639,article_YR2438,article_YR6479,article_YS6935,article_YS9175,article_YV2782,article_YV6825,article_YV7315,article_YX1723,article_YX2167,article_ZB7415,article_ZC7213,article_ZD3611,article_ZE6328,article_ZE9366,article_ZF7765,article_ZI6739,article_ZJ5718,article_ZK3537,article_ZK4922,article_ZM8792,article_ZO6398,article_ZR3493,article_ZR8112,article_ZS4134,article_ZT1211,article_ZU2733,article_ZU5523,article_ZV2187,article_ZW6694,article_ZX2294,article_ZX8794,article_ZZ2466,article.1_AC7347,article.1_AP5568,article.1_CB8861,article.1_FG2965,article.1_GG8661,article.1_LI3529,article.1_OC6355,article.1_PC6383,article.1_TX1463,article.1_VT7698,productgroup_HARDWARE ACCESSORIES,productgroup_SHOES,productgroup_SHORTS,productgroup_SWEATSHIRTS,category_FOOTBALL GENERIC,category_GOLF,category_INDOOR,category_RELAX CASUAL,category_RUNNING,category_TRAINING,style_regular,style_slim,style_wide,"sizes_xs,s,m,l,xl","sizes_xxs,xs,s,m,l,xl,xxl",gender_kids,gender_men,gender_unisex,gender_women,label_DidBuy,label_DidNotBuy
1,0,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,13.29,205,104,57,255,187,255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1
1,1,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,2.29,188,238,104,255,187,255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,1
1,2,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,1.7,205,173,0,255,187,255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1
1,3,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,9.0,205,140,149,164,211,238,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,1
1,4,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,9.6,138,43,226,164,211,238,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1


In [33]:
#Optimizing Numeric Columns with Subtypes

all_df_int = all_df.select_dtypes(include=['int'])
converted_int = all_df_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(all_df_int))
print(mem_usage(converted_int))

3.53 MB
3.53 MB


In [34]:
#Optimizing  Columns with Subtypes
all_df_float = all_df.select_dtypes(include=['float'])
converted_float = all_df_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(all_df_float))
print(mem_usage(converted_float))   #reduct by 2 MB

7.35 MB
5.44 MB


In [35]:
#Optimizing object types using categoricals
all_df_obj = all_df.select_dtypes(include=['object'])
all_df_obj.describe()

Unnamed: 0,country,article,article.1,productgroup,category,style,sizes,gender,label
count,100001,100001,100001,100001,100001,100001,100001,100001,100001
unique,3,477,10,4,6,3,2,4,2
top,Germany,BR3179,OC6355,SHOES,TRAINING,regular,"xxs,xs,s,m,l,xl,xxl",women,DidNotBuy
freq,49401,610,10001,60001,30001,50000,90001,70001,86073


In [36]:
#category type uses integer values under the hood to represent the values in a column, rather than the raw values. 
#Pandas uses a separate mapping dictionary that maps the integer values to the raw ones.
#This arrangement is useful whenever a column contains a limited set of values. When we convert a column to the 
#category dtype, pandas uses the most space efficient int subtype that can represent all of the unique
converted_obj = pd.DataFrame()
for col in all_df_obj.columns:
    num_unique_values = len(all_df_obj[col].unique())
    num_total_values = len(all_df_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = all_df_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = all_df_obj[col]
print(mem_usage(all_df_obj))
print(mem_usage(converted_obj))       #reduct by about 8 times 

  res = shell.run_cell(code, store_history=store_history, silent=silent)


62.00 MB
7.04 MB


In [37]:
converted_obj.head()

Unnamed: 0,Unnamed: 1,country,article,article.1,productgroup,category,style,sizes,gender,label
1,0,Germany,YN8639,OC6355,SHOES,TRAINING,slim,"xxs,xs,s,m,l,xl,xxl",women,DidNotBuy
1,1,Germany,YN8639,AP5568,SHORTS,TRAINING,regular,"xxs,xs,s,m,l,xl,xxl",women,DidNotBuy
1,2,Germany,YN8639,CB8861,HARDWARE ACCESSORIES,GOLF,regular,"xxs,xs,s,m,l,xl,xxl",women,DidNotBuy
1,3,Germany,YN8639,LI3529,SHOES,RUNNING,regular,"xxs,xs,s,m,l,xl,xxl",kids,DidNotBuy
1,4,Germany,YN8639,GG8661,SHOES,RELAX CASUAL,regular,"xxs,xs,s,m,l,xl,xxl",women,DidNotBuy


In [74]:
#This Function load and optimize large files in terms size and time for CSVs files by taken path 
path='C://Users//Mohammed//Desktop//VATASk'
files = os.listdir(path)    
files = list(filter(lambda f: f.endswith('.csv'), files))  #list of csv files in path
 
def LoadOptLargeInPatch(batchSize,labelColName,files=files):   #return optimized data frame after processing in terms the size
    #mylist=range(1,int(len(all_df)/batch),1)
    mylist=range(1,10000000,1)
    chunks=[]
    dfs=[]
    for i in range(len(files)):
        chunks.append(pd.read_csv(files[i],chunksize=batchSize,parse_dates=['retailweek'],infer_datetime_format=True))
    for j in range(len(chunks)):
        dfs.append(pd.concat(chunks[j],keys=mylist))
    all_df = pd.concat(dfs,axis=0)
    #Optimizing Numeric Columns with Subtypes
    all_df_int = all_df.select_dtypes(include=['int'])
    converted_int = all_df_int.apply(pd.to_numeric,downcast='unsigned')
    all_df_float = all_df.select_dtypes(include=['float'])
    converted_float = all_df_float.apply(pd.to_numeric,downcast='float')
    all_df_obj = all_df.select_dtypes(include=['object'])
    converted_obj = pd.DataFrame()
    #Optimizing object types using categoricals
    for col in all_df_obj.columns:
        num_unique_values = len(all_df_obj[col].unique())
        num_total_values = len(all_df_obj[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:,col] = all_df_obj[col].astype('category')
        else:
            converted_obj.loc[:,col] = all_df_obj[col]
    del converted_obj[str(labelColName)]      #delet label encoding to apply hot encoding separately to  be as needed
    optimized_all_df = all_df.copy()
    del optimized_all_df[str(labelColName)]    #delet label encoding to apply hot encoding separately to be as needed
    optimized_all_df[converted_int.columns] = converted_int
    optimized_all_df[converted_float.columns] = converted_float
    for i in range(len(converted_obj.columns)):
        converted_obj[(converted_obj.columns[i])]= converted_obj[(converted_obj.columns[i])].cat.codes
    
    optimized_all_df[converted_obj.columns] = converted_obj
    #X=optimized_all_df.drop(['label'],axis=1)
    #X=pd.get_dummies(X)
    y=all_df['label']
    y=pd.get_dummies(y)
   
    return optimized_all_df,y,mem_usage(optimized_all_df),mem_usage(y)

In [75]:
X,y,SizeOptdf,SizeY=LoadOptLargeInPatch(batchSize=100,labelColName='label')
SizeOptdf,SizeY

  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)


('13.93 MB', '3.72 MB')

In [76]:
X.head(200)  #show 200 rows or 2 batch

Unnamed: 0,Unnamed: 1,country,article,sales,regular_price,current_price,ratio,retailweek,promo1,promo2,customer_id,article.1,productgroup,category,cost,style,sizes,gender,rgb_r_main_col,rgb_g_main_col,rgb_b_main_col,rgb_r_sec_col,rgb_g_sec_col,rgb_b_sec_col
1,0,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,6,1,5,13.29,1,1,3,205,104,57,255,187,255
1,1,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,1,2,5,2.29,0,1,3,188,238,104,255,187,255
1,2,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,2,0,1,1.7,0,1,3,205,173,0,255,187,255
1,3,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,5,1,4,9.0,0,1,0,205,140,149,164,211,238
1,4,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,4,1,3,9.6,0,1,3,138,43,226,164,211,238
1,5,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,8,3,5,4.2,2,1,3,79,148,205,164,211,238
1,6,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,7,1,0,9.9,2,0,2,139,26,26,205,155,155
1,7,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,9,1,2,5.2,2,1,3,135,206,250,205,155,155
1,8,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,3,0,4,1.29,1,1,3,181,181,181,205,155,155
1,9,2,444,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,0,1,0,8.7,0,1,1,139,137,137,205,155,155


In [77]:
y.head(200) #show 200 rows or 2 batch

Unnamed: 0,Unnamed: 1,DidBuy,DidNotBuy
1,0,0,1
1,1,0,1
1,2,0,1
1,3,0,1
1,4,0,1
1,5,1,0
1,6,0,1
1,7,1,0
1,8,0,1
1,9,1,0


**2. Cohen's kappa Test between two raters.**

**Evaluating Cohen’s Kappa**

**The value for kappa can be less than 0 (negative). A score of 0 means that there is random agreement among raters, whereas a score of 1 means that there is a complete agreement between the raters. Therefore, a score that is less than 0 means that there is less agreement than random chance.**

In [60]:
cohenkappa_df=pd.read_csv('cohenkappa/cohenkappa.csv')
cohenkappa_df.head(20)

Unnamed: 0,Annotator 1,Annotator 2,Text
0,Positive,Positive,I am voting early and in person.\n\nWhat’s your voting plan?
1,Positive,Neutral,Hey there - this event and my involvement was presented to my team differently from how it’s now being promoted.\n\nThanks for pointing it out. Taking a look into this now.
2,Positive,Neutral,"There’s no questionnaire about symptoms, no requirement to report to the House physician if we are experiencing symptoms, no temperature check upon arrival to DC, nothing.\n\nThen we all get on planes and travel back to our communities, often twice a week. It’s dangerous."
3,Positive,Positive,"RT : SKIMS is launching at this Monday, October 5 in London, Manchester, and Birmingham!\n\nBe the first to shop and experi…"
4,Negative,Positive,Yes! I am going to start studying soon 📚 https://t.co/HiLMVbNNjI
5,Neutral,Positive,"Money doesn’t have to buy your vote.\n\nEach time you look past the flood of TV commercials, scammy Facebook posts and angry mailers to verify the claims you see &amp; think critically, you help fight the corrupting tide of money in politics.\n\nHere’s how ⬇️\nhttps://t.co/TkHTr9zr7C"
6,Positive,Negative,"The HEROES Act the House passed months ago covers several (but not all) of these points.\n\nGOP Senators, pledging allegiance to Mitch McConnell, have intentionally sat on this bill &amp; did nothing.\n\nPeople are starving. Babies are sleeping on car floors. Where is the needed urgency?"
7,Positive,Neutral,"So what IS the Green New Deal, anyway? https://t.co/hqjMSpgxDg"
8,Positive,Neutral,"(And btw - if they give a substantive answer to open a conversation, great! But words matter a great deal in moments like these, and we shouldn’t simply repeat them without giving real thought to understanding why)"
9,Positive,Positive,"Pence demanding that Harris answer *his* own personal questions when he won’t even answer the moderator’s is gross, and exemplary of the gender dynamics so many women have to deal with at work."


In [61]:
cohenkappa_df=cohenkappa_df.drop(['Text'],axis=1)  # drop Text column we do not need here

In [63]:
cohenkappa_df.head(100)  #show all rows to compute the match rows between 2 raters

Unnamed: 0,Annotator 1,Annotator 2
0,Positive,Positive
1,Positive,Neutral
2,Positive,Neutral
3,Positive,Positive
4,Negative,Positive
5,Neutral,Positive
6,Positive,Negative
7,Positive,Neutral
8,Positive,Neutral
9,Positive,Positive


In [64]:
# cohenkappa_df.replace({"Negative": 0, "Neutral": 1,"Positive":2}, inplace=True)
# cohenkappa_df.head()

In [66]:
cohenkappa_df['Annotator 1'].value_counts()

Positive    28
Neutral     12
Negative    10
Name: Annotator 1, dtype: int64

In [67]:
cohenkappa_df['Annotator 2'].value_counts()

Neutral     25
Positive    24
Negative     1
Name: Annotator 2, dtype: int64

In [68]:
len(cohenkappa_df['Annotator 1'])

50

In [69]:
##----->*****Annotator_1******
pos1= cohenkappa_df['Annotator 1'].str.contains('Positive')
Pos1=cohenkappa_df[pos1]    
neg1= cohenkappa_df['Annotator 1'].str.contains('Negative')
neg1=cohenkappa_df[neg1]    
neu1= cohenkappa_df['Annotator 1'].str.contains('Neutral')
neu1=cohenkappa_df[neu1]    

##----->*****Annotator_2******
pos2= cohenkappa_df['Annotator 2'].str.contains('Positive')
Pos2=cohenkappa_df[pos2]    
neg2= cohenkappa_df['Annotator 2'].str.contains('Negative')
neg2=cohenkappa_df[neg2]    
neu2= cohenkappa_df['Annotator 2'].str.contains('Neutral')
neu2=cohenkappa_df[neu2]    

In [72]:
#Cohen's Kappa Statistic Formula
# kappa = (P_o - P_e) / (1 - P_e)

##----->*****Annotator_1******

# Number of times Annotator_1 predicted positive
Annotator_1_positive = len(Pos1) / len(cohenkappa_df)

# Number of times Annotator_1 predicted negative
Annotator_1_negative = len(neg1) / len(cohenkappa_df)

# Number of times Annotator_1 predicted neutral
Annotator_1_neutral = len(neu1) / len(cohenkappa_df)


##----->*****Annotator_2******


# Number of times Annotator_2 predicted positive
Annotator_2_positive = len(Pos2) / len(cohenkappa_df)

# Number of times Annotator_2 predicted negative
Annotator_2_negative = len(neg2) / len(cohenkappa_df)

# Number of times Annotator_2 predicted neutral
Annotator_2_neutral = len(neu2) / len(cohenkappa_df)



# Probability both agree positive
both_agree_positive = Annotator_1_positive * Annotator_2_positive

# Probability both agree negative
both_agree_negative = Annotator_2_negative * Annotator_2_negative

# Probability both agree neutral
both_agree_neutral = Annotator_1_neutral * Annotator_2_neutral

# Sum of agreed correct answers total documents 
# the both raters agree 14 same psitive answers and 7 neutral answers from 50
P_o = (14 + 7 ) / len(cohenkappa_df)

#Probability both randomly agree
P_e = both_agree_positive + both_agree_negative+both_agree_neutral

# Calculate the kappa statistic
kappa = (P_o - P_e) / (1 - P_e)
print(f"The Cohen's kappa statistic equals: {kappa}")

The Cohen's kappa statistic equals: 0.050425671250818496


In [73]:
########just for check and make sure of our result on cohen_kappa_score libbrary#######
from sklearn.metrics import cohen_kappa_score

cohen_kappa_score(cohenkappa_df['Annotator 1'], cohenkappa_df['Annotator 2'])

0.044795783926218746

In [4]:
# from keras.datasets import mnist
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix
# from keras.utils import np_utils
# from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
# from keras.optimizers import RMSprop
# from keras.preprocessing.image import ImageDataGenerator
# from keras.callbacks import ReduceLROnPlateau