In [157]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# adjust settings to display all columns
pd.set_option('display.max_columns', None)

In [158]:
# path to the excel file
file_path = 'dataset/Case Study for Data Scientist.xlsx'
# read the second sheet
df = pd.read_excel(file_path, sheet_name = 1, skiprows = 1)

In [159]:
# view columns
df.columns

Index(['Day', 'Date', 'Region', 'Mill Code', 'Mill Type', 'Actual OER %',
       'Crop Freshness Score', 'Ripe %', 'Long Stalk %', 'Rat Damage %',
       'Loose Fruits %', 'Rainfall (mm)', 'Age Profile (years)',
       'Total Oil Losses %', 'Downtime %', 'FFB Processed (MT)', 'Seed A %',
       'Seed B %', 'Other Seeds %', 'Coastal %', 'Inland %'],
      dtype='object')

In [160]:
# view overall data
df.head(10)

Unnamed: 0,Day,Date,Region,Mill Code,Mill Type,Actual OER %,Crop Freshness Score,Ripe %,Long Stalk %,Rat Damage %,Loose Fruits %,Rainfall (mm),Age Profile (years),Total Oil Losses %,Downtime %,FFB Processed (MT),Seed A %,Seed B %,Other Seeds %,Coastal %,Inland %
0,Wed,2020-01-01,R01,Z001,IP,0.0,300.0,95.15767,1.241942,4.734563,6.833731,0.0,15.721261,0.0,0.0,0.0,31.114312,0.0,68.885688,49.290868,50.709132
1,Thu,2020-01-02,R01,Z001,IP,22.27,289.458286,95.430159,1.85971,3.931418,7.475889,3.342912,13.950932,1.387521,0.254744,648.373,39.927122,0.0,60.072878,46.188086,53.811914
2,Fri,2020-01-03,R01,Z001,IP,22.55,262.890977,94.617481,1.514098,3.568296,7.416632,24.355107,13.322048,1.484348,0.058243,530.021,37.392953,0.0,62.607047,45.361864,54.638136
3,Sat,2020-01-04,R01,Z001,IP,22.12,262.653835,95.404031,1.457055,4.809152,7.225248,1.208765,14.120394,1.36469,0.099324,702.527,33.681927,0.0,66.318073,51.698756,48.301244
4,Sun,2020-01-05,R01,Z001,IP,0.0,,,,,0.0,,,0.0,0.0,0.0,,,,,
5,Mon,2020-01-06,R01,Z001,IP,0.0,296.263902,94.621112,0.525784,4.608052,7.744236,0.0,11.764399,0.0,0.0,0.0,34.267461,0.0,65.732539,49.753684,50.246316
6,Tue,2020-01-07,R01,Z001,IP,21.44,258.003938,96.388443,1.603789,4.425478,7.300997,0.0,11.689557,1.309626,0.208859,925.635,41.379892,0.0,58.620108,47.450798,52.549202
7,Wed,2020-01-08,R01,Z001,IP,21.93,244.123776,96.317309,1.531533,2.679098,6.554261,5.066349,14.76742,1.616845,0.074198,458.272,34.239454,0.0,65.760546,52.625872,47.374128
8,Thu,2020-01-09,R01,Z001,IP,22.03,259.805635,94.747949,1.676143,2.971952,7.158706,5.404249,12.884193,1.545609,0.093896,530.071,38.851989,0.0,61.148011,55.053023,44.946977
9,Fri,2020-01-10,R01,Z001,IP,22.02,243.863031,95.482388,1.737853,3.706116,10.234498,22.269123,13.096816,1.551516,0.085219,535.147,23.731303,0.0,76.268697,45.334915,54.665085


In [161]:
# check data types
print(df.dtypes)

Day                             object
Date                    datetime64[ns]
Region                          object
Mill Code                       object
Mill Type                       object
Actual OER %                   float64
Crop Freshness Score           float64
Ripe %                         float64
Long Stalk %                   float64
Rat Damage %                   float64
Loose Fruits %                 float64
Rainfall (mm)                  float64
Age Profile (years)            float64
Total Oil Losses %             float64
Downtime %                     float64
FFB Processed (MT)             float64
Seed A %                       float64
Seed B %                       float64
Other Seeds %                  float64
Coastal %                      float64
Inland %                       float64
dtype: object


In [162]:
# check for missing values
print(df.isnull().sum())

Day                        0
Date                       0
Region                     0
Mill Code                  0
Mill Type                  0
Actual OER %              31
Crop Freshness Score    9708
Ripe %                  9708
Long Stalk %            9708
Rat Damage %            9708
Loose Fruits %          1766
Rainfall (mm)           9708
Age Profile (years)     9548
Total Oil Losses %         0
Downtime %                46
FFB Processed (MT)        31
Seed A %                9344
Seed B %                9344
Other Seeds %           9344
Coastal %               9344
Inland %                9344
dtype: int64


In [163]:
# statistical summary
print(df.describe())

                                Date  Actual OER %  Crop Freshness Score  \
count                          56512  56481.000000          46804.000000   
mean   2022-06-01 11:59:59.999999744     14.642561            271.093671   
min              2020-01-01 00:00:00      0.000000            100.000000   
25%              2021-03-17 00:00:00      0.000000            259.511313   
50%              2022-06-01 12:00:00     20.080000            273.725582   
75%              2023-08-17 00:00:00     21.070000            287.036451   
max              2024-10-31 00:00:00     34.690000            300.000000   
std                              NaN      9.422467             22.214956   

             Ripe %  Long Stalk %  Rat Damage %  Loose Fruits %  \
count  46804.000000  46804.000000  46804.000000    54746.000000   
mean      82.917508      1.802622      8.937942        7.602551   
min        0.000000      0.000000      0.000000        0.000000   
25%       78.474020      0.401135      2.588567

In [164]:
# get the initial total number of row
initial_total_df = len(df)

In [165]:
# impute or drop missing values
# impute numerical columns with median
numerical_cols = df.select_dtypes(include = [np.number]).columns
# df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# drop rows with missing values for categorical columns
categorical_cols = df.select_dtypes(exclude = [np.number]).columns
df = df.dropna(subset = categorical_cols)

# alternative drop rows with missing values for numerical columns
df = df.dropna(subset = numerical_cols)

In [166]:
# check for missing values after imputation
print(df.isnull().sum())

Day                     0
Date                    0
Region                  0
Mill Code               0
Mill Type               0
Actual OER %            0
Crop Freshness Score    0
Ripe %                  0
Long Stalk %            0
Rat Damage %            0
Loose Fruits %          0
Rainfall (mm)           0
Age Profile (years)     0
Total Oil Losses %      0
Downtime %              0
FFB Processed (MT)      0
Seed A %                0
Seed B %                0
Other Seeds %           0
Coastal %               0
Inland %                0
dtype: int64


In [167]:
# get total number of row after imputation
after_total_df = len(df)
after_total_df

46593

In [168]:
# ensure correct data types
df['Date'] = pd.to_datetime(df['Date']) # convert 'Date' to datetime
df[numerical_cols] = df[numerical_cols].astype(float) # ensure all numerics are floats

# encode categorical variables
# convert 'Day' to numeric (Monday = 0, Sunday = 6)
day_mapping = {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
df['Day'] = df['Day'].map(day_mapping)

# one-hot encode 'Mill Type', 'Region', and 'Mill Code'
# initialize the OneHotEncoder
encoder = OneHotEncoder()

# fit and transform the categorical columns
encoded = encoder.fit_transform(df[['Region', 'Mill Code', 'Mill Type']])

# normalize or scale the numerical columns (optional)
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

df

Unnamed: 0,Day,Date,Region,Mill Code,Mill Type,Actual OER %,Crop Freshness Score,Ripe %,Long Stalk %,Rat Damage %,Loose Fruits %,Rainfall (mm),Age Profile (years),Total Oil Losses %,Downtime %,FFB Processed (MT),Seed A %,Seed B %,Other Seeds %,Coastal %,Inland %
0,2,2020-01-01,R01,Z001,IP,0.000000,1.000000,0.951577,0.040434,0.091640,0.078533,0.000000,0.622624,0.000000,0.000000,0.000000,0.311143,0.0,0.688857,0.492909,0.507091
1,3,2020-01-02,R01,Z001,IP,0.641972,0.947291,0.954302,0.060546,0.076095,0.085913,0.016318,0.552512,0.173379,0.062731,0.350277,0.399271,0.0,0.600729,0.461881,0.538119
2,4,2020-01-03,R01,Z001,IP,0.650043,0.814455,0.946175,0.049294,0.069067,0.085232,0.118888,0.527606,0.185478,0.014342,0.286338,0.373930,0.0,0.626070,0.453619,0.546381
3,5,2020-01-04,R01,Z001,IP,0.637648,0.813269,0.954040,0.047437,0.093084,0.083033,0.005901,0.559224,0.170526,0.024459,0.379533,0.336819,0.0,0.663181,0.516988,0.483012
5,0,2020-01-06,R01,Z001,IP,0.000000,0.981320,0.946211,0.017118,0.089192,0.088997,0.000000,0.465917,0.000000,0.000000,0.000000,0.342675,0.0,0.657325,0.497537,0.502463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56506,5,2024-10-26,R03,Z031,MB,0.592390,0.921041,0.929654,0.000000,0.144446,0.101378,0.000606,0.617909,0.178890,0.021535,0.387888,0.562364,0.0,0.437636,0.000000,1.000000
56507,6,2024-10-27,R03,Z031,MB,0.000000,0.942326,0.933065,0.000000,0.145484,0.093098,0.001936,0.661509,0.000000,0.000000,0.000000,0.702813,0.0,0.297187,0.000000,1.000000
56508,0,2024-10-28,R03,Z031,MB,0.578265,0.897958,0.915140,0.034137,0.152483,0.130552,0.000000,0.633745,0.177201,0.013710,0.443842,0.567887,0.0,0.432113,0.000000,1.000000
56509,1,2024-10-29,R03,Z031,MB,0.592390,0.917043,0.902718,0.074867,0.193115,0.106474,0.000000,0.636642,0.212003,0.042844,0.428396,0.440362,0.0,0.559638,0.000000,1.000000
