<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/OneHotEncoding_Multiple_Cat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Feature Engineering - One Hot Encoding with Multiple Categories

## Steps for one-hot encoding (OHE) a feature with many categories:
- Count how often each label occurs in the categorical variable (f1)
- Choose the 10 most frequent labels of f1
- Apply OHE to those 10 labels only; rows holding any other label of f1 get 0 in all ten new columns

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Read the crop-production CSV from the notebook's working directory
# and preview its first five rows.
crop = pd.read_csv(filepath_or_buffer='./crop_production.csv')
crop.head(n=5)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [3]:
# Number of missing values in each column.
crop.isnull().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [4]:
# Summary statistics, transposed so that every described column becomes a row.
crop.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Crop_Year,246091.0,2005.643018,4.952164,1997.0,2002.0,2006.0,2010.0,2015.0
Area,246091.0,12002.820864,50523.4,0.04,80.0,582.0,4392.0,8580100.0
Production,242361.0,582503.442251,17065810.0,0.0,88.0,729.0,7023.0,1250800000.0


In [5]:
# count / unique / top / freq for the text (object-dtype) columns, one row per column.
crop.select_dtypes(include=['object']).describe().transpose()

Unnamed: 0,count,unique,top,freq
State_Name,246091,33,Uttar Pradesh,33306
District_Name,246091,646,BIJAPUR,945
Season,246091,6,Kharif,95951
Crop,246091,124,Rice,15104


In [6]:
# True if at least one row is flagged as a duplicate of another row.
crop.duplicated().any()

False

In [7]:
# (rows, columns) of the raw frame, before any indicator columns are added.
crop.shape

(246091, 7)

## Choose the top 10, 20 or 30 labels, depending on how many labels the categorical variable has

In [8]:
# For every text (object-dtype) column, print its 30 most frequent labels
# followed by a separator line.
separator = '---' * 20
for cat in crop.select_dtypes(include='object').columns:
  print(cat, ':\n')
  frequencies = crop[cat].value_counts()
  print(frequencies.sort_values(ascending=False).head(30))
  print(separator)

State_Name :

Uttar Pradesh             33306
Madhya Pradesh            22943
Karnataka                 21122
Bihar                     18885
Assam                     14628
Odisha                    13575
Tamil Nadu                13547
Maharashtra               12628
Rajasthan                 12514
Chhattisgarh              10709
Andhra Pradesh             9628
West Bengal                9613
Gujarat                    8436
Haryana                    5875
Telangana                  5649
Uttarakhand                4896
Kerala                     4261
Nagaland                   3906
Punjab                     3173
Meghalaya                  2867
Arunachal Pradesh          2546
Himachal Pradesh           2494
Jammu and Kashmir          1634
Tripura                    1412
Manipur                    1267
Jharkhand                  1266
Mizoram                     957
Puducherry                  876
Sikkim                      714
Dadra and Nagar Haveli      263
Name: State_Name, dtype: i

In [9]:
# Check how many features a full one-hot encoding (dummy variables,
# first level of each feature dropped) would produce.
len_dummies = pd.get_dummies(data=crop, drop_first=True).shape
for caption, frame_shape in (
    ('Original dataframe shape: ', crop.shape),
    ('After dummies: ', len_dummies),
):
  print(caption, frame_shape)

Original dataframe shape:  (246091, 7)
After dummies:  (246091, 808)


## Create the top 10 binary variables


In [10]:
# top_10_variables = (
#     [
#      item for item in crop[cat].value_counts()
#     .sort_values(ascending=False).head(10)
#     for cat in crop.select_dtypes(include='object').columns.tolist()
#     ]
# )

# def top_10_variables(df, cat):
#   return (
#       [
#      item for item in df[cat].value_counts()
#     .sort_values(ascending=False).head(10)
#     .index
#     ]
#   )

def top_10_labels(df, cat, n=10):
  """Add 0/1 indicator columns to ``df`` for the most frequent labels of ``df[cat]``.

  One column per selected label is written into ``df`` in place. The column
  is named after the label and holds 1 where ``df[cat]`` equals that label
  and 0 elsewhere, so a row whose label was not selected is 0 in every
  indicator column.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to encode; it is modified in place.
  cat : str
      Name of the categorical column to encode.
  n : int, default 10
      How many of the most frequent labels receive an indicator column.

  Returns
  -------
  tuple
      ``(labels, last_indicator)``. ``labels`` is the list of encoded labels,
      most frequent first. ``last_indicator`` is the indicator column of the
      last label in that list, or an empty integer Series when there is
      nothing to encode.

  Notes
  -----
  Only the indicator of the last label is returned; use ``df[labels]`` to
  select all indicator columns.
  """
  # Look the column up once instead of on every loop iteration.
  source = df[cat]
  # value_counts() already orders labels by descending frequency.
  top_10_var = source.value_counts().head(n).index.tolist()
  if not top_10_var:
    # No label to encode: the loop below would never bind `label`, so
    # return explicitly instead of failing on an unbound variable.
    return top_10_var, pd.Series(dtype='int64')
  for label in top_10_var:
    df[label] = np.where(source == label, 1, 0)
  return top_10_var, df[top_10_var[-1]]


### State name dummies

In [11]:
# Adds one 0/1 column per top State_Name label to `crop` (in place) and
# keeps the list of encoded labels for selecting those columns later.
top_10_state_name, state_name_dump = top_10_labels(crop, 'State_Name')
top_10_state_name

['Uttar Pradesh',
 'Madhya Pradesh',
 'Karnataka',
 'Bihar',
 'Assam',
 'Odisha',
 'Tamil Nadu',
 'Maharashtra',
 'Rajasthan',
 'Chhattisgarh']

In [12]:
# Second return value of top_10_labels: the indicator column of the last
# label in the list only, not all ten indicator columns.
state_name_dump

0         0
1         0
2         0
3         0
4         0
         ..
246086    0
246087    0
246088    0
246089    0
246090    0
Name: Chhattisgarh, Length: 246091, dtype: int64

In [13]:
# Spot-check the encoding on 7 random rows. The fixed seed keeps the
# displayed sample identical across re-runs of the notebook.
crop[['State_Name'] + top_10_state_name].sample(7, random_state=42)

Unnamed: 0,State_Name,Uttar Pradesh,Madhya Pradesh,Karnataka,Bihar,Assam,Odisha,Tamil Nadu,Maharashtra,Rajasthan,Chhattisgarh
156221,Odisha,0,0,0,0,0,1,0,0,0,0
66322,Haryana,0,0,0,0,0,0,0,0,0,0
11646,Arunachal Pradesh,0,0,0,0,0,0,0,0,0,0
150462,Odisha,0,0,0,0,0,1,0,0,0,0
103583,Madhya Pradesh,0,1,0,0,0,0,0,0,0,0
112070,Madhya Pradesh,0,1,0,0,0,0,0,0,0,0
117621,Madhya Pradesh,0,1,0,0,0,0,0,0,0,0


In [14]:
# The 7 original columns plus the 10 State_Name indicator columns.
crop.shape

(246091, 17)

### Season dummies

In [15]:
# Text (object-dtype) columns still present; the integer indicator columns
# added above are not selected here.
crop.select_dtypes(include='object').columns.tolist()

['State_Name', 'District_Name', 'Season', 'Crop']

In [16]:
# Season has fewer than 10 distinct labels, so every label gets an indicator
# column. NOTE(review): the labels in the output carry trailing spaces, and
# the generated column names inherit them.
top_10_season, season_dump = top_10_labels(crop, 'Season')
top_10_season

['Kharif     ',
 'Rabi       ',
 'Whole Year ',
 'Summer     ',
 'Winter     ',
 'Autumn     ']

In [17]:
# Indicator column of the last label in `top_10_season` only.
season_dump

0         0
1         0
2         0
3         0
4         0
         ..
246086    0
246087    0
246088    0
246089    0
246090    0
Name: Autumn     , Length: 246091, dtype: int64

In [18]:
# Spot-check the Season encoding on 7 random rows. The fixed seed keeps the
# displayed sample identical across re-runs of the notebook.
crop[['Season'] + top_10_season].sample(7, random_state=42)

Unnamed: 0,Season,Kharif,Rabi,Whole Year,Summer,Winter,Autumn
175366,Rabi,0,1,0,0,0,0
235151,Kharif,1,0,0,0,0,0
242,Kharif,1,0,0,0,0,0
215881,Kharif,1,0,0,0,0,0
212802,Summer,0,0,0,1,0,0
158405,Summer,0,0,0,1,0,0
66552,Kharif,1,0,0,0,0,0
