In [61]:
import pandas as pd
import numpy as np
import re
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('climate_data_final.csv')
df.head()

Unnamed: 0,state,city,rainfall,temperature,humidity
0,Abia,Aba,2747,25.6,The month of highest relative humidity is June...
1,Abia,Ahia Aba,2219,25.7,The month with the most relative humidity is S...
2,Abia,Bende,2112,26.1,The month that sees the most relative humidity...
3,Abia,Ihie Ndume,2122,25.8,The month with the highest relative humidity i...
4,Abia,Mbukwa,2112,26.1,The month with the highest relative humidity i...


In [3]:
df.shape

(294, 5)

In [4]:
df.state = df.state.str.strip()
print(df.state.unique())
print(len(df.state.unique()))

['Abia' 'Adamawa' 'Akwa Ibom' 'Anambra' 'Bauchi' 'Bayelsa' 'Benue' 'Borno'
 'Cross River' 'Delta' 'Ebonyi' 'Edo' 'Ekiti' 'Enugu'
 'Federal Capital Territory' 'Gombe' 'Imo' 'Jigawa' 'Kaduna' 'Kano'
 'Katsina' 'Kebbi' 'Kogi' 'Kwara' 'Lagos' 'Nasarawa' 'Niger' 'Ogun' 'Ondo'
 'Osun' 'Oyo' 'Plateau' 'Rivers' 'Sokoto' 'Taraba' 'Yobe' 'Zamfara']
37


In [5]:
df.humidity[:5].tolist()

['The month of highest relative humidity is June (90.79 %). The month with the lowest relative humidity is January (67.66 %).\nThe month which sees the most rainfall is October (11.20 days). The driest month of the year is January (28.67 days).',
 'The month with the most relative humidity is September (90.04 %). The month with the least relative humidity is January (61.71 %).\nThe wettest month is October (28.07 days), while the driest is December (5.67).',
 'The month that sees the most relative humidity is October (57.48 %). The month with the lowest amount of relative humidity is January (57.48 %).\nThe wettest month is October (28.07 days), whilst the driest is January (4.67).',
 'The month with the highest relative humidity is October (89.37 %). The month with the lowest relative humidity is January (58.35 %).\nThe month with the highest number of rainy days is October (28.07 days). The month with the lowest number of rainy days is January (4.80 days).',
 'The month with the high

In [6]:
doct = df.drop('city', axis=1)

In [7]:
doct['A'] = doct['humidity'].str.replace('The|number|rainy|month|with|the|highest|relative|humidity|is|\
                                        |lowest|which|sess|most|rainfall|days|of|least|%|while|year|that|\
                                        |sees|amount|percent|\n|wettest|driest|whilst|fewest|(|)', '', 
                                         regex=True)
doct = doct.drop('humidity', axis=1)
doct.head()

Unnamed: 0,state,rainfall,temperature,A
0,Abia,2747,25.6,June (90.79 ). January (67.66 )...
1,Abia,2219,25.7,September (90.04 ). January (6...
2,Abia,2112,26.1,October (57.48 ). January (...
3,Abia,2122,25.8,October (89.37 ). January (58....
4,Abia,2112,26.1,October (89.42 ). January (59....


In [8]:
doct.A[280:300].tolist()

['       August (76.15 ).         March (9.80 ).        August (0.00 ).        January (18.77 ).',
 '        August (73.97 ).         March (9.24 ).    August (16.03 ).     January (0.00 ).',
 '        August (74.84 ).         March (9.34 ).        August (16.30 ).         January (0.00 ).',
 '       August (82.39 ).         March (11.53 ).        August (0.00 ).        January (21.67 ).',
 '         August (13.16 ).           March (13.16 ).    August (23.40 ),     December (0.00).',
 '        August (81.09 ).         March (11.86 ).    August (21.97 ),     December (0.00).',
 '       August (81.73 ).         March (12.23 ).        August (0.00 ).        December (21.97 ).',
 '        August (81.16 ).         March (11.40 ).    August (21.97 ).     December (0.00 ).',
 '        August (81.02 ).         March (11.25 ).    August (21.97 ).     December (0.00 ).',
 '        August (81.98 ).         March (12.05 ).        August (21.70 ).         December (0.00 ).',
 '        August (81.0

In [9]:
doct = doct["A"].str.split(n=11, expand=True)
doct.columns = ['Humidity_{}'.format(x+1) for x in doct.columns]
doct

Unnamed: 0,Humidity_1,Humidity_2,Humidity_3,Humidity_4,Humidity_5,Humidity_6,Humidity_7,Humidity_8,Humidity_9,Humidity_10,Humidity_11,Humidity_12
0,June,(90.79,).,January,(67.66,).,October,(11.20,).,January,(28.67,).
1,September,(90.04,).,January,(61.71,).,October,(28.07,"),",December,(5.67).,
2,October,(57.48,).,January,(57.48,).,October,(28.07,"),",January,(4.67).,
3,October,(89.37,).,January,(58.35,).,October,(28.07,).,January,(4.80,).
4,October,(89.42,).,January,(59.93,).,October,(28.07,).,January,(4.67,).
...,...,...,...,...,...,...,...,...,...,...,...,...
289,August,(81.98,).,March,(12.05,).,August,(21.70,).,December,(0.00,).
290,August,(81.02,).,March,(11.25,).,August,(20.17,"),",December,(0.00).,
291,August,(83.41,).,March,(13.78,).,August,(23.83,).,December,(0.00,).
292,August,(81.16,).,March,(11.40,).,August,(21.97,).,December,(0.00,).


In [10]:
clean_weath = doct[['Humidity_2', 'Humidity_5', 'Humidity_8', 'Humidity_11']]
clean_weath['Humidity_2'] = clean_weath['Humidity_2'].str.extract(r'(\d+.\d+)').astype('float')
clean_weath['Humidity_5'] = clean_weath['Humidity_5'].str.extract(r'(\d+.\d+)').astype('float')
clean_weath['Humidity_8'] = clean_weath['Humidity_8'].str.extract(r'(\d+.\d+)').astype('float')
clean_weath['Humidity_11'] = clean_weath['Humidity_11'].str.extract(r'(\d+.\d+)').astype('float')
clean_weath

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_weath['Humidity_2'] = clean_weath['Humidity_2'].str.extract(r'(\d+.\d+)').astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_weath['Humidity_5'] = clean_weath['Humidity_5'].str.extract(r'(\d+.\d+)').astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_weath['

Unnamed: 0,Humidity_2,Humidity_5,Humidity_8,Humidity_11
0,90.79,67.66,11.20,28.67
1,90.04,61.71,28.07,5.67
2,57.48,57.48,28.07,4.67
3,89.37,58.35,28.07,4.80
4,89.42,59.93,28.07,4.67
...,...,...,...,...
289,81.98,12.05,21.70,0.00
290,81.02,11.25,20.17,0.00
291,83.41,13.78,23.83,0.00
292,81.16,11.40,21.97,0.00


In [11]:
new_df = pd.concat([df, clean_weath], axis=1)

In [12]:
new_df

Unnamed: 0,state,city,rainfall,temperature,humidity,Humidity_2,Humidity_5,Humidity_8,Humidity_11
0,Abia,Aba,2747,25.6,The month of highest relative humidity is June...,90.79,67.66,11.20,28.67
1,Abia,Ahia Aba,2219,25.7,The month with the most relative humidity is S...,90.04,61.71,28.07,5.67
2,Abia,Bende,2112,26.1,The month that sees the most relative humidity...,57.48,57.48,28.07,4.67
3,Abia,Ihie Ndume,2122,25.8,The month with the highest relative humidity i...,89.37,58.35,28.07,4.80
4,Abia,Mbukwa,2112,26.1,The month with the highest relative humidity i...,89.42,59.93,28.07,4.67
...,...,...,...,...,...,...,...,...,...
289,Zamfara,Mai Didi,590,26.9,The month with the highest relative humidity i...,81.98,12.05,21.70,0.00
290,Zamfara,Ruwan Kura,505,27.1,The month with the most relative humidity is A...,81.02,11.25,20.17,0.00
291,Zamfara,Takulawa,696,25.8,The month with the highest relative humidity i...,83.41,13.78,23.83,0.00
292,Zamfara,Unguwar Alugwaji,577,27.0,The month with the highest relative humidity i...,81.16,11.40,21.97,0.00


In [13]:
state_df = new_df.drop('humidity', axis=1)
state_df.head()

Unnamed: 0,state,city,rainfall,temperature,Humidity_2,Humidity_5,Humidity_8,Humidity_11
0,Abia,Aba,2747,25.6,90.79,67.66,11.2,28.67
1,Abia,Ahia Aba,2219,25.7,90.04,61.71,28.07,5.67
2,Abia,Bende,2112,26.1,57.48,57.48,28.07,4.67
3,Abia,Ihie Ndume,2122,25.8,89.37,58.35,28.07,4.8
4,Abia,Mbukwa,2112,26.1,89.42,59.93,28.07,4.67


In [14]:
# state_df['rainfall'] = state_df['rainfall'].str.replace(r'([a-zA-Z|\n/.,>:()'']+)', '', regex=True)
state_df['rainfall'] = state_df['rainfall'].str.extract(r'(\d+.\d+)').astype('float')

In [15]:
state_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   state        294 non-null    object 
 1   city         294 non-null    object 
 2   rainfall     286 non-null    float64
 3   temperature  294 non-null    float64
 4   Humidity_2   294 non-null    float64
 5   Humidity_5   294 non-null    float64
 6   Humidity_8   294 non-null    float64
 7   Humidity_11  294 non-null    float64
dtypes: float64(6), object(2)
memory usage: 18.5+ KB


In [16]:
state_df['mean_humidity'] = state_df[['Humidity_2', 'Humidity_5']].mean(axis=1)
state_df['mean_rainfall'] = state_df[['Humidity_8', 'Humidity_11']].mean(axis=1)
state_df_ = state_df.drop(columns=['Humidity_2', 'Humidity_5', 'Humidity_8', 'Humidity_11', 'city'])
state_df_

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall
0,Abia,2747.0,25.6,79.225,19.935
1,Abia,2219.0,25.7,75.875,16.870
2,Abia,2112.0,26.1,57.480,16.370
3,Abia,2122.0,25.8,73.860,16.435
4,Abia,2112.0,26.1,74.675,16.370
...,...,...,...,...,...
289,Zamfara,590.0,26.9,47.015,10.850
290,Zamfara,505.0,27.1,46.135,10.085
291,Zamfara,696.0,25.8,48.595,11.915
292,Zamfara,577.0,27.0,46.280,10.985


In [17]:
state_mean_df = state_df_.groupby(['state']).mean()
state_mean_df.reset_index(level=0, inplace=True)
state_mean_df

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall
0,Abia,2190.2,25.96,72.558,16.8025
1,Adamawa,683.6,27.48,35.973,11.306
2,Akwa Ibom,3033.0,25.725,81.29625,20.035
3,Anambra,1708.4,26.62,62.107,14.739
4,Bauchi,695.9,26.63,36.412,11.5975
5,Bayelsa,2824.6,26.29,81.9405,22.6915
6,Benue,1348.0,27.54,57.154,13.563
7,Borno,431.785714,27.973333,37.584,9.806333
8,Cross River,2427.2,26.58,69.687,17.541
9,Delta,2394.444444,26.13,74.549,18.3785


In [18]:
soil_df = pd.read_csv('Nigeria_soils_ocp.csv')
soil_df['state'] = soil_df['state'].str.replace('State', '', regex=True)
soil_df.head()

Unnamed: 0,city,county,state,project,Farm,Depth,Depthcode,pH,N,OC,...,PH_DESCRIP,SUITABILIT,SOIL_TEXTU,SOIL_CLASS,VEGETATION,DISTRIBUTI,SOIL_CLA_1,PERCENTAGE,MAJOR_CROP,Depth_2
0,Kantoro,Shanga,Kebbi,OCP fertilizer,Has,0-20 cm,1,6.11879,0.043971,0.478427,...,Very Strongly Acidic,Moderately Suitable,Sandy Loam,Typic Ustifluvent,Cultivated to swamp rice and oil palm,76515.95,Eutric Fluvisol,8.28,"Yam, Rice,Maize, Millet,Irish Potatoes","Deep, Mostly Deep"
1,Kantoro,Shanga,Kebbi,OCP fertilizer,Has,0-20 cm,1,6.13216,0.071001,0.612303,...,Very Strongly Acidic,Moderately Suitable,Sandy Loam,Typic Ustifluvent,Cultivated to swamp rice and oil palm,76515.95,Eutric Fluvisol,8.28,"Yam, Rice,Maize, Millet,Irish Potatoes","Deep, Mostly Deep"
2,Kawara,Shanga,Kebbi,OCP fertilizer,Has,0-20 cm,1,6.43914,0.057943,0.542822,...,Very Strongly Acidic,Moderately Suitable,Sandy Loam,Typic Ustifluvent,Cultivated to swamp rice and oil palm,76515.95,Eutric Fluvisol,8.28,"Yam, Rice,Maize, Millet,Irish Potatoes","Deep, Mostly Deep"
3,Kawara,Shanga,Kebbi,OCP fertilizer,Has,0-20 cm,1,5.76363,0.068463,0.434531,...,Very Strongly Acidic,Moderately Suitable,Sandy Loam,Typic Ustifluvent,Cultivated to swamp rice and oil palm,76515.95,Eutric Fluvisol,8.28,"Yam, Rice,Maize, Millet,Irish Potatoes","Deep, Mostly Deep"
4,Kawara,Shanga,Kebbi,OCP fertilizer,Has,0-20 cm,1,6.76575,0.05488,0.740769,...,Very Strongly Acidic,Moderately Suitable,Sandy Loam,Typic Ustifluvent,Cultivated to swamp rice and oil palm,76515.95,Eutric Fluvisol,8.28,"Yam, Rice,Maize, Millet,Irish Potatoes","Deep, Mostly Deep"


In [19]:
soil_df.shape

(3256, 36)

In [20]:
soil_df.state = soil_df.state.str.strip()
soil_df.state.value_counts()

state
Niger                        696
Kaduna                       544
Nasarawa                     503
Plateau                      420
Bauchi                       372
Taraba                       193
Katsina                      161
Kano                         127
Kebbi                         48
Kogi                          48
Benue                         42
Federal Capital Territory     35
Kwara                         10
Adamawa                        7
Bayelsa                        5
Name: count, dtype: int64

In [21]:
final_df = pd.merge(state_mean_df, soil_df, on='state')
final_df.head()

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall,city,county,project,Farm,Depth,...,PH_DESCRIP,SUITABILIT,SOIL_TEXTU,SOIL_CLASS,VEGETATION,DISTRIBUTI,SOIL_CLA_1,PERCENTAGE,MAJOR_CROP,Depth_2
0,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,"Sugarcane,Cassava,Yam,Irish Potatoes,Cocoa,Rub...","Moderately, Generally Deep"
1,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,"Sugarcane,Cassava,Yam,Irish Potatoes,Cocoa,Rub...","Moderately, Generally Deep"
2,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,0-20 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,"Sugarcane,Cassava,Yam,Irish Potatoes,Cocoa,Rub...","Moderately, Generally Deep"
3,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,0-20 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,"Sugarcane,Cassava,Yam,Irish Potatoes,Cocoa,Rub...","Moderately, Generally Deep"
4,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,0-20 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,"Sugarcane,Cassava,Yam,Irish Potatoes,Cocoa,Rub...","Moderately, Generally Deep"


In [22]:
final_df.shape

(3211, 40)

In [23]:
crop_list = final_df['MAJOR_CROP'].str.split(',')
final_df.MAJOR_CROP = crop_list
final_df = final_df.explode('MAJOR_CROP').reset_index(drop=True)
final_df.head()

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall,city,county,project,Farm,Depth,...,PH_DESCRIP,SUITABILIT,SOIL_TEXTU,SOIL_CLASS,VEGETATION,DISTRIBUTI,SOIL_CLA_1,PERCENTAGE,MAJOR_CROP,Depth_2
0,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,Sugarcane,"Moderately, Generally Deep"
1,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,Cassava,"Moderately, Generally Deep"
2,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,Yam,"Moderately, Generally Deep"
3,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,Irish Potatoes,"Moderately, Generally Deep"
4,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,20-40 cm,...,Strogly acidic to Slightly Acidic,Moderately Suitable,Clay Loam,Oxic Ustropept,Grass,92048.48,Eutric cambisol,9.96,Cocoa,"Moderately, Generally Deep"


In [24]:
final_df.shape

(24813, 40)

In [25]:
final_df['Depth'] = final_df['Depth'].str.replace('cm', '', regex=True)
final_df.Depth = final_df.Depth.str.replace(" ", '')
final_df.SOIL_PH = final_df.SOIL_PH.str.replace(" ", '')
final_df.SLOPE = final_df.SLOPE.str.replace(" ", '')
final_df['SLOPE'] = final_df['SLOPE'].str.replace('%', '', regex=True)
final_df[['Depth1', 'Depth2']] = final_df['Depth'].str.split('-|–', expand=True)
final_df[['SOIL_PH1', 'SOIL_PH2']] = final_df['SOIL_PH'].str.split('-', expand=True)
final_df[['col', 'slope']] = final_df['SLOPE'].str.split('-', expand=True)

final_df = final_df.drop(columns=['Depth', 'SOIL_PH', 'Depthcode', 'col', 'SLOPE'])
final_df

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall,city,county,project,Farm,pH,...,DISTRIBUTI,SOIL_CLA_1,PERCENTAGE,MAJOR_CROP,Depth_2,Depth1,Depth2,SOIL_PH1,SOIL_PH2,slope
0,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,6.37594,...,92048.48,Eutric cambisol,9.96,Sugarcane,"Moderately, Generally Deep",20,40,5.5,6.3,2
1,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,6.37594,...,92048.48,Eutric cambisol,9.96,Cassava,"Moderately, Generally Deep",20,40,5.5,6.3,2
2,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,6.37594,...,92048.48,Eutric cambisol,9.96,Yam,"Moderately, Generally Deep",20,40,5.5,6.3,2
3,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,6.37594,...,92048.48,Eutric cambisol,9.96,Irish Potatoes,"Moderately, Generally Deep",20,40,5.5,6.3,2
4,Adamawa,683.6,27.48,35.973,11.306,,Mayo-Belwa,OCP fertilizer,Nm,6.37594,...,92048.48,Eutric cambisol,9.96,Cocoa,"Moderately, Generally Deep",20,40,5.5,6.3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24808,Taraba,2018.6,23.94,52.289,13.312,,Ardo-Kola,OCP fertilizer,Nm,6.11449,...,92048.48,Dystric Nitosol,9.96,Groundnut,"Moderately, Generally Deep",0,20,5.5,6.3,2
24809,Taraba,2018.6,23.94,52.289,13.312,,Ardo-Kola,OCP fertilizer,Nm,6.11449,...,92048.48,Dystric Nitosol,9.96,MilletSorghum,"Moderately, Generally Deep",0,20,5.5,6.3,2
24810,Taraba,2018.6,23.94,52.289,13.312,,Ardo-Kola,OCP fertilizer,Nm,6.11449,...,92048.48,Dystric Nitosol,9.96,Cowpea,"Moderately, Generally Deep",0,20,5.5,6.3,2
24811,Taraba,2018.6,23.94,52.289,13.312,,Ardo-Kola,OCP fertilizer,Nm,6.11449,...,92048.48,Dystric Nitosol,9.96,Rice,"Moderately, Generally Deep",0,20,5.5,6.3,2


In [26]:
final_df.Depth1 = final_df.Depth1.astype(float)
final_df.Depth2 = final_df.Depth2.astype(float)
final_df.SOIL_PH1 = final_df.SOIL_PH1.astype(float)
final_df.SOIL_PH2 = final_df['SOIL_PH2'].str.extract(r'(\d+.\d+)').astype('float')
final_df.slope = final_df.slope.astype(float)

In [27]:
final_df['avg_depth'] = final_df[['Depth1', 'Depth2']].mean(axis=1)
final_df['avg_soil_ph'] = final_df[['SOIL_PH1', 'SOIL_PH1']].mean(axis=1)

final_df = final_df.drop(columns=['Depth1', 'Depth2', 'SOIL_PH1', 'SOIL_PH2', 'city', 'project', 'ID', 
                                'project', 'MAPPING_UN', 'GEOLOGY', 'ECOLOGICAL', 'DRAINAGE', 'PH_DESCRIP', 
                                'SUITABILIT', 'SOIL_TEXTU', 'DISTRIBUTI', 'PERCENTAGE'])

In [28]:
final_df

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall,county,Farm,pH,N,OC,...,Mn,Fe,SOIL_CLASS,VEGETATION,SOIL_CLA_1,MAJOR_CROP,Depth_2,slope,avg_depth,avg_soil_ph
0,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Sugarcane,"Moderately, Generally Deep",2.0,30.0,5.5
1,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Cassava,"Moderately, Generally Deep",2.0,30.0,5.5
2,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Yam,"Moderately, Generally Deep",2.0,30.0,5.5
3,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Irish Potatoes,"Moderately, Generally Deep",2.0,30.0,5.5
4,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Cocoa,"Moderately, Generally Deep",2.0,30.0,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24808,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Groundnut,"Moderately, Generally Deep",2.0,10.0,5.5
24809,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,MilletSorghum,"Moderately, Generally Deep",2.0,10.0,5.5
24810,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Cowpea,"Moderately, Generally Deep",2.0,10.0,5.5
24811,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Rice,"Moderately, Generally Deep",2.0,10.0,5.5


In [29]:
final_df.MAJOR_CROP.value_counts()

MAJOR_CROP
Yam                       2707
Maize                     2677
Sorghum                   2593
Cotton                    2243
Cassava                   2028
Groundnut                 1847
Cocoa                     1157
Rice                      1022
Irish Potatoes            1000
Cowpea                     936
Millet                     891
 Millet                    878
Sugarcane                  375
 Rice                      352
Vegetables                 309
Cow Pea                    293
  Banana                   293
OilPalm                    265
Rubber                     254
MilletSorghum              250
Plantain                   249
Millets                    246
Oil Palm                   234
Acha                       233
Cass                       211
Bean                       207
SugarCan                   207
Upland rice                192
Cocoa yam                  182
Cocoayams.                 108
G.nuts                      83
Beans                       

In [56]:
crop_df = final_df[final_df['MAJOR_CROP'].isin(['Yam', 'Maize', 'Sorghum', 'Cotton', 'Cassava', 'Groundnut',
                                                'Cocoa', 'Rice', 'Irish Potatoes', 'Cowpea', 'Millet',
                                                ' Millet', ' Rice', 'Cow Pea', 'Vegetables', 'Bean', 'Beans',
                                                'G.nuts', 'Upland rice', 'Cocoa yam', ' Groundnut', ' Yam',
                                                ' Maize', ' Cowpea', 'Maize.Cocoa', 'Vegetable', 'Millets'])]
crop_df

Unnamed: 0,state,rainfall,temperature,mean_humidity,mean_rainfall,county,Farm,pH,N,OC,...,Mn,Fe,SOIL_CLASS,VEGETATION,SOIL_CLA_1,MAJOR_CROP,Depth_2,slope,avg_depth,avg_soil_ph
1,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Cassava,"Moderately, Generally Deep",2.0,30.0,5.5
2,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Yam,"Moderately, Generally Deep",2.0,30.0,5.5
3,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Irish Potatoes,"Moderately, Generally Deep",2.0,30.0,5.5
4,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Cocoa,"Moderately, Generally Deep",2.0,30.0,5.5
6,Adamawa,683.6,27.48,35.973,11.306,Mayo-Belwa,Nm,6.37594,0.044609,0.609208,...,89.2675,102.2200,Oxic Ustropept,Grass,Eutric cambisol,Groundnut,"Moderately, Generally Deep",2.0,30.0,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24805,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Irish Potatoes,"Moderately, Generally Deep",2.0,10.0,5.5
24806,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Cocoa,"Moderately, Generally Deep",2.0,10.0,5.5
24808,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Groundnut,"Moderately, Generally Deep",2.0,10.0,5.5
24810,Taraba,2018.6,23.94,52.289,13.312,Ardo-Kola,Nm,6.11449,0.029150,0.508449,...,35.7464,50.4691,Typic Paleustult,Sorghum and Groundnut,Dystric Nitosol,Cowpea,"Moderately, Generally Deep",2.0,10.0,5.5


In [33]:
# y = crop_df['MAJOR_CROP']
# X = crop_df.drop(columns=['MAJOR_CROP'], axis=1)

In [57]:
crop_df.shape

(22008, 27)

In [66]:
final_crop = crop_df.drop(columns=['county', 'SOIL_CLASS', 'SOIL_CLA_1', 'Depth_2'])
final_crop = final_crop.reset_index()
final_crop

Unnamed: 0,index,state,rainfall,temperature,mean_humidity,mean_rainfall,Farm,pH,N,OC,...,Na,Zn,Cu,Mn,Fe,VEGETATION,MAJOR_CROP,slope,avg_depth,avg_soil_ph
0,1,Adamawa,683.6,27.48,35.973,11.306,Nm,6.37594,0.044609,0.609208,...,14.96620,1.28535,0.860957,89.2675,102.2200,Grass,Cassava,2.0,30.0,5.5
1,2,Adamawa,683.6,27.48,35.973,11.306,Nm,6.37594,0.044609,0.609208,...,14.96620,1.28535,0.860957,89.2675,102.2200,Grass,Yam,2.0,30.0,5.5
2,3,Adamawa,683.6,27.48,35.973,11.306,Nm,6.37594,0.044609,0.609208,...,14.96620,1.28535,0.860957,89.2675,102.2200,Grass,Irish Potatoes,2.0,30.0,5.5
3,4,Adamawa,683.6,27.48,35.973,11.306,Nm,6.37594,0.044609,0.609208,...,14.96620,1.28535,0.860957,89.2675,102.2200,Grass,Cocoa,2.0,30.0,5.5
4,6,Adamawa,683.6,27.48,35.973,11.306,Nm,6.37594,0.044609,0.609208,...,14.96620,1.28535,0.860957,89.2675,102.2200,Grass,Groundnut,2.0,30.0,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22003,24805,Taraba,2018.6,23.94,52.289,13.312,Nm,6.11449,0.029150,0.508449,...,9.27022,0.82956,0.072998,35.7464,50.4691,Sorghum and Groundnut,Irish Potatoes,2.0,10.0,5.5
22004,24806,Taraba,2018.6,23.94,52.289,13.312,Nm,6.11449,0.029150,0.508449,...,9.27022,0.82956,0.072998,35.7464,50.4691,Sorghum and Groundnut,Cocoa,2.0,10.0,5.5
22005,24808,Taraba,2018.6,23.94,52.289,13.312,Nm,6.11449,0.029150,0.508449,...,9.27022,0.82956,0.072998,35.7464,50.4691,Sorghum and Groundnut,Groundnut,2.0,10.0,5.5
22006,24810,Taraba,2018.6,23.94,52.289,13.312,Nm,6.11449,0.029150,0.508449,...,9.27022,0.82956,0.072998,35.7464,50.4691,Sorghum and Groundnut,Cowpea,2.0,10.0,5.5


In [81]:
vect = CountVectorizer()

vect_crop = vect.fit_transform(final_crop)
print("Count Matrix:", vect_crop.toarray())

Count Matrix: [[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0

In [82]:
cosine_sim = cosine_similarity(vect_crop)

In [114]:
crops_to_plant = "Rice"

def get_index_from_title(crop):
    return final_crop[final_crop.MAJOR_CROP == crop]["index"].values[0]

movie_index = get_index_from_title(crops_to_plant)

In [115]:
similar_crops = list(enumerate(cosine_sim[movie_index]))

In [116]:
sorted_similar_crop = sorted(similar_crops, key=lambda x:x[1], reverse=True)

In [118]:
def get_title_from_index(index):
    return final_crop[final_crop.index == index]["MAJOR_CROP"].values[0]
i=0
for crop in sorted_similar_crop:
    print(get_title_from_index(crop[0]))
    i=i+1
    if i>5:
        break

Irish Potatoes
Cassava
Yam
Irish Potatoes
Cocoa
Groundnut


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,stratify=y, random_state = 42)

In [38]:
X_train.shape

(17606, 22)

In [39]:
y_train.shape

(17606,)

## Converting categorical variables to categorical types

In [42]:
X_train['state'] = X_train['state'].astype('category')
X_train['VEGETATION'] = X_train['VEGETATION'].astype('category')
X_train['Farm'] = X_train['Farm'].astype('category')

In [43]:
y_train.value_counts()

MAJOR_CROP
Yam               2166
Maize             2142
Sorghum           2074
Cotton            1794
Cassava           1622
Groundnut         1509
Millet            1415
Rice              1099
Cocoa              926
Irish Potatoes     800
Cowpea             765
Vegetables         247
Cow Pea            234
Millets            197
Bean               166
Upland rice        154
Cocoa yam          146
Beans               66
G.nuts              66
Maize.Cocoa         16
Vegetable            2
Name: count, dtype: int64

## class balancing using SMOTENC model

In [44]:
smotenc = SMOTENC(categorical_features=[0, 5, 18],random_state = 101, sampling_strategy='minority', k_neighbors=1)
X_oversample, y_oversample = smotenc.fit_resample(X_train, y_train)

In [45]:
y_oversample.value_counts()

MAJOR_CROP
Vegetable         2166
Yam               2166
Maize             2142
Sorghum           2074
Cotton            1794
Cassava           1622
Groundnut         1509
Millet            1415
Rice              1099
Cocoa              926
Irish Potatoes     800
Cowpea             765
Vegetables         247
Cow Pea            234
Millets            197
Bean               166
Upland rice        154
Cocoa yam          146
Beans               66
G.nuts              66
Maize.Cocoa         16
Name: count, dtype: int64

In [46]:
X_oversample_ = X_oversample.drop(columns=['state', 'Farm', 'VEGETATION'])

## Feature Selection

In [47]:
sel_df = SelectKBest(chi2, k=19,)

x_train = sel_df.fit_transform(X_oversample_, y_oversample)
df = pd.DataFrame(x_train)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,2018.6,23.94,52.289,13.312,6.07501,0.026509,0.334598,4.35365,100.811,51.4264,53.3159,9.96813,0.734818,0.149907,59.0239,77.304,6.0,10.0,6.6
1,1208.6,28.06,42.321,13.286,5.86549,0.059465,0.628293,2.37832,182.387,124.158,57.4501,12.4162,1.03509,0.785175,0.0,0.0,6.0,10.0,6.6
2,1329.8,23.74,33.557,13.707,6.1673,0.050161,0.754754,3.78322,146.035,120.34,52.6711,14.8295,1.22581,0.759079,65.7194,120.843,2.0,10.0,5.5
3,586.0,28.93,36.1985,10.7755,6.21553,0.040416,0.613525,3.04388,217.425,102.438,94.1114,10.1371,1.39772,0.435965,55.5662,83.8361,2.0,10.0,6.6
4,1390.0,27.52,54.909,13.797,6.22479,0.058408,0.620526,4.12574,276.786,100.911,72.7219,9.44642,1.28456,0.220096,47.3654,146.694,13.0,10.0,5.8
