In [1]:
# !pip install catboost
# !pip install xgboost

# 1. Import Necessary Libraries 

In [2]:
# import necessary libraries
import pandas as pd #library for data manipulation
import numpy as np # library for working with arrays

# import preprocessing libraries

# creating visualizations in Python
import matplotlib.pyplot as plt
import plotly.express as px
from pylab import rcParams
import seaborn as sns
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 8,8
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# import modelling libraries
from catboost import CatBoostRegressor #features built in support for handling categorical features
from xgboost import XGBRegressor #designed to be fast and efficient: for both classification and regression
from sklearn.ensemble import RandomForestRegressor #creates multiple decision trees
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RepeatedStratifiedKFold

# pd.options.display.max_rows=2000
pd.set_option('display.max_rows', 100) #sets maximum rows to 100
pd.set_option('display.max_columns', 30) #sets maximum columns to 30

Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'catboost'

# 2. Read Data

In [None]:
# read the csv data, using the 'date' column as the index
# (parse_dates=True asks pandas to try parsing that index as dates)
# NOTE(review): later cells report every column as object dtype and zero out
# numeric cells containing '#', so the file presumably carries a non-data tag
# row right after the header — confirm against the raw file and consider
# skipping that row here instead of patching values afterwards.
df = pd.read_csv('wfp_food_prices_ken.csv', parse_dates=True, index_col='date')

In [None]:
# check sample data/records
df.sample(5)

# 3. Data Type description and statistical information

In [None]:
# check shape of the dataset
df.shape

In [None]:
# check information of the dataset
df.info()

### 
* From the above, every column has the `object` dtype. This points to an anomaly, since
  columns such as `price` cannot sensibly be stored as text.
* No missing values from the dataset

In [None]:
# check statistical information of numerical data
# df.describe()

# 4. Data Preprocessing

In [None]:
# check for duplicates
df.duplicated().sum()

In [None]:
# remove duplicates
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# share of missing entries per column, expressed as a percentage
missing_perc = df.isna().mean().mul(100)
missing_values = pd.DataFrame(
    {
        'column_name': df.columns,
        'Missing Percentage': missing_perc,
    }
)
missing_values

### 
- No missing values from the dataset

In [None]:
# make a copy of the dataframe
df1 = df.copy()

In [None]:
# rename some columns
df1 = df1.rename(columns={'admin1': 'Province', 'admin2': 'County'})

In [None]:
# column names
print(*df1.columns, sep='\n')

In [None]:
# list each column next to the dtype pandas currently assigns to it
for col, col_dtype in df1.dtypes.items():
    print(f'{col} : {col_dtype}')

In [None]:
# remove some patterns in the columns you want to convert into numerical
num_cols = ['latitude', 'longitude', 'price', 'usdprice']
import re
def numeric(text):
    """Prepare one raw cell value for conversion to a number.

    Strings are lower-cased, and any string containing ``#`` is replaced by
    the integer ``0``. Values that are not strings (for example floats, NaN
    or None) are returned unchanged instead of raising ``AttributeError``
    on ``.lower()``.

    Parameters
    ----------
    text : object
        A single cell value.

    Returns
    -------
    object
        ``0`` for strings containing ``#``, the lower-cased string for any
        other string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        # nothing to clean: the value is already numeric (or missing)
        return text

    text = text.lower()
    # a plain substring test is enough for a literal '#'
    if '#' in text:
        # NOTE(review): 0 makes these cells look like genuine zero readings in
        # later statistics — confirm 0 (rather than a missing value) is intended.
        return 0
    return text

# Clean every cell of the numeric columns. Series.map is applied per column
# because DataFrame.applymap is deprecated since pandas 2.1, while this form
# works on old and new pandas alike.
df1[num_cols] = df1[num_cols].apply(lambda column: column.map(numeric))

In [None]:
# cast the cleaned numeric columns to float64 in a single step
num_cols = ['latitude', 'longitude', 'price', 'usdprice']
df1 = df1.astype(dict.fromkeys(num_cols, 'float64'))

df1.dtypes

### 
- The dataset is now clean, and each column has its appropriate data type

In [None]:
# check statistical information of numerical data
df1.describe()

### 
- No outliers in my dataset

### 4.1 Quality mapping

In [None]:
# columns
df1.columns

In [None]:
# check category 
print(*df1['category'].unique(), sep='\n')

### 
- category seems to be ok

In [None]:
# check commodity 
print(*df1['commodity'].unique(), sep='\n')

In [None]:
# do a quality mapping on commodity column
import re
def commodity(text):
    """Collapse a raw commodity label onto its commodity family.

    The label is lower-cased and compared against an ordered list of
    prefixes; the first prefix that matches decides the family name that is
    returned. Labels matching no prefix are returned lower-cased. Values
    that are not strings are returned unchanged.

    The millet prefix is checked before the broader milk prefix. Previously
    the milk rule ran first, so millet labels were turned into 'Milk' and
    the 'Millet' rule could never fire.

    Parameters
    ----------
    text : object
        A single commodity label.

    Returns
    -------
    object
        The family name, the lower-cased label, or the untouched value for
        non-string input.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()

    # Order matters: the first matching prefix wins, so a more specific
    # prefix must come before a broader one that also matches it.
    # NOTE(review): the single-letter prefixes 'b' and 'p' also capture any
    # other label starting with that letter (a label such as 'bread' would
    # become 'Beans') — confirm against the printed unique values.
    prefix_to_family = (
        ('ma', 'Maize'),
        ('b', 'Beans'),
        ('p', 'Potatoes'),
        ('mill', 'Millet'),
        ('mi', 'Milk'),
        ('fu', 'Fuel'),
        ('mea', 'Meat'),
        ('oi', 'Oil'),
        ('on', 'Onion'),
        ('ri', 'Rice'),
        ('so', 'Sorghum'),
        ('cow', 'Cowpeas'),
        ('fis', 'Fish'),
    )
    for prefix, family in prefix_to_family:
        if text.startswith(prefix):
            return family
    return text
        
# normalise the column, then list the distinct values that remain
df1['commodity'] = df1['commodity'].map(commodity)
commodity_divider = '-------------------------------'
print('Unique values in Commodity column: ')
print(commodity_divider)
print('\n'.join(str(value) for value in df1['commodity'].unique()))
print(commodity_divider)

In [None]:
# check unit column 
print(*df1['unit'].unique(), sep='\n')

In [None]:
# do a quality mapping on unit column
import re
def unit(text):
    """Normalise a raw unit label to '1 KG', '1 L' or its lower-cased form.

    After lower-casing:

    * 'bunch' and 'head' are returned as they are;
    * a quantity followed by 'kg' or 'g', or a bare 'kg', becomes '1 KG';
    * a quantity followed by 'ml' or 'l', or a bare 'l', becomes '1 L';
    * anything else is returned lower-cased.

    Quantities may carry a decimal part ('12.5 kg'), and a unit is only
    recognised as a whole token, so letters inside other words are left
    alone ('pail' stays 'pail'). Values that are not strings are returned
    unchanged.

    This function only rewrites the label; it does not touch any price.
    NOTE(review): relabelling e.g. '90 kg' as '1 KG' without rescaling the
    matching price makes prices for different pack sizes look comparable —
    confirm prices are normalised elsewhere.

    Parameters
    ----------
    text : object
        A single unit label.

    Returns
    -------
    object
        The normalised label, or the untouched value for non-string input.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    if text in ('bunch', 'head'):
        return text

    # integer or decimal amount, e.g. '90' or '12.5'
    quantity = r'\d+(?:\.\d+)?'
    # The look-behind keeps a match from starting in the middle of a word or
    # number; the trailing \b keeps it from ending in the middle of a word.
    text = re.sub(rf'(?<![\w.])(?:{quantity}\s*k?g|kg)\b', '1 KG', text)
    text = re.sub(rf'(?<![\w.])(?:{quantity}\s*m?l|l)\b', '1 L', text)
    return text



# normalise the column, then list the distinct values that remain
df1['unit'] = df1['unit'].map(unit)
unit_divider = '-------------------------------'
print('Unique values in Unit column: ')
print(unit_divider)
print('\n'.join(str(value) for value in df1['unit'].unique()))
print(unit_divider)

In [None]:
# df1.columns

In [None]:
# print the distinct values of every column that is not in the excluded list
excluded_cols = ['Province', 'County', 'market', 'price', 'usdprice', 'latitude', 'longitude']
for col in df1.columns:
    if col in excluded_cols:
        continue
    distinct_as_text = df1[col].unique().astype(str)
    unique_values = "\n".join(distinct_as_text)
    distinct_count = df1[col].nunique()
    print(f'{col} : {distinct_count} : {unique_values}\n')
    print('------------------------------------------')

In [None]:
# df1.dtypes

# 5. Feature Engineering
- Here we will pick our essential features for training the model

In [None]:
# let us see our columns
df1.columns

In [None]:
# work on a copy of the df1 DataFrame so the feature-engineering steps leave df1 untouched
df2 = df1.copy()

In [None]:
# discard the columns that will not be used as model features
df2 = df2.drop(columns=['usdprice', 'currency'])

In [None]:
# new columns to fit our model with
df2.columns

In [None]:
df2.dtypes

In [None]:
# collect the columns to encode: every column that still has the object dtype
from sklearn.preprocessing import LabelEncoder

cat_cols = [column for column in df2.columns if df2[column].dtype == 'object']

# select the object columns that have exactly three distinct values
# NOTE(review): the name `binary_cols` suggests two values, but the filter is
# `== 3`; presumably one of the three is a non-data value left in the column —
# confirm against the unique values printed earlier.
binary_cols = [column for column in cat_cols if df2[column].nunique() == 3]
print(*binary_cols)

In [None]:
# one-hot encode the selected columns, dropping the first level of each
df2 = pd.get_dummies(data=df2, columns=binary_cols, drop_first=True)
previously_categorical = set(cat_cols) | set(binary_cols)
dummied_cols = [name for name in df2.columns if name not in previously_categorical]
df2.head()

In [None]:
# Turn every remaining object column into integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

cat_cols = [name for name in df2.columns if df2[name].dtype == 'object']

# the same encoder is refitted for each column in turn, so after this step
# `le` only holds the mapping of the last column it processed
df2 = df2.assign(**{name: le.fit_transform(df2[name]) for name in cat_cols})

df2.head()

In [None]:
# separate the target from the features, then hold out 20% of the rows for testing
y = df2['price']
X = df2.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Initialize the models
from sklearn.metrics import r2_score

# One fixed seed, matching the train/test split, so reruns give the same scores
SEED = 0

# verbose=0 keeps CatBoost from printing one log line per boosting iteration
cat = CatBoostRegressor(loss_function='RMSE', n_estimators=100, learning_rate=0.05, max_depth=5,
                        random_seed=SEED, verbose=0)
xgb = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.05, max_depth=5,
                   random_state=SEED)
linear = LinearRegression()
rf = RandomForestRegressor(random_state=SEED)
models = [cat, linear, rf, xgb]

# Fit each model on the training data and score it on the held-out test data
for model in models:
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # R2 is a goodness-of-fit score (higher is better), not an error, so it is
    # reported under its own name; RMSE is the actual error, in price units.
    r2 = r2_score(y_test, y_pred) * 100
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'{type(model).__name__}: R2 = {r2:.2f}%, RMSE = {rmse:.2f}')

# CONCLUSION!
- From the models trained above, RandomForestRegressor, XGBRegressor and CatBoostRegressor all perform very well, with RandomForestRegressor tending to be the best. Tuning the hyperparameters of XGBRegressor and CatBoostRegressor may improve them further.