In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# **THE CONTEXT**

Before we begin our project, we have to understand the data and the problem: what problem do we want to solve? Is the data ready to be processed, or do we have to clean it first? Let's break it down:

**The Data**

The data comes from a flotation plant and has several columns. The first column is the date; the second and third columns are quality measures of the iron ore pulp right before it is fed into the flotation plant. Columns 4 to 8 are the most important variables that impact the ore quality at the end of the process. Columns 9 to 22 contain process data (level and air flow inside the flotation columns), which also impact the ore quality. The last two columns are the final iron ore pulp quality measurements from the lab.

**The Problem**
1. The aim is to predict the % Silica in Concentrate every minute.
2. How many steps (hours) ahead can we predict % Silica in Concentrate?
3. Can we predict % Silica in Concentrate without using % Iron Concentrate?

**DATA PREPROCESSING**

The first step in data science is data preprocessing. In this step, we have to:
1. Handle the variables (numerical, categorical, date/time)
2. Handle missing values
3. Handle outliers
4. Scale the data

**IMPORTING LIBRARIES**

We import several libraries to process the data:
* pandas for data manipulation
* numpy for array processing
* matplotlib for visualization
* seaborn for visualization with more advanced statistical plots

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**LOAD DATA**

In [None]:
# Load the flotation-plant CSV into `data`.
# - decimal=","          : treat ',' as the decimal separator so the measurement
#                          columns are parsed as numbers.
# - parse_dates=["date"] : parse the "date" column into datetime values.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only emits a warning.
# Fully duplicated rows are dropped right after loading.

data = pd.read_csv(
    '../input/quality-prediction-in-a-mining-process/MiningProcess_Flotation_Plant_Database.csv',
    decimal=",",
    parse_dates=["date"],
).drop_duplicates()
data.info()

In [None]:
# Check the shape of the data: (number of rows, number of columns)
data.shape

In [None]:
# Count the missing values in each column to see whether any cleaning is needed
data.isnull().sum()

In [None]:
# Display the first rows to see what kind of data this is
data.head()

In [None]:
# Visualize the pairwise correlation between the features as a heatmap.
# Only numeric columns are used: `data` also holds the parsed "date" column, and
# newer pandas versions no longer drop non-numeric columns from corr() silently.

plt.figure(figsize=(30, 30))
cor = data.select_dtypes(include="number").corr()
corelation = sns.heatmap(cor, annot=True, cmap="RdYlGn")

**Preparing Dataset For ML**

As the result of the code above shows, several features do not really affect the
dependent feature, so we can drop them and keep only the independent features that are most correlated with it.

In [None]:
# Select features by the strength of their linear relationship with the target.
# NOTE: a manual drop of hand-picked columns ('date', '% Iron Concentrate',
# 'Ore Pulp pH', 'Flotation Column 01/02/03 Air Flow') used to sit here as
# commented-out code; the selection below relies on the correlation matrix instead.

# Absolute correlation of every column with the output variable
cor_target = cor["% Silica Concentrate"].abs()

# Keep only the columns whose absolute correlation exceeds 0.15
relevant_features = cor_target.loc[cor_target > 0.15]
relevant_features

In [None]:
# Keep the 3 largest absolute correlations.
# Note: the target's correlation with itself (1.0) is part of `relevant_features`,
# so this keeps the target column plus its two most correlated features.
relevant_features = relevant_features.nlargest(n=3)

In [None]:
# Restrict the working frame to the selected columns (in ranking order)
data = data.reindex(columns=relevant_features.index)
data.head()

In [None]:
# Check the distribution of the 'Flotation Column 01 Air Flow' feature.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot / sns.displot once the environment's seaborn version supports them.
sns.distplot(data['Flotation Column 01 Air Flow'])

In [None]:
# Check the distribution of the '% Iron Concentrate' feature.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot / sns.displot once the environment's seaborn version supports them.
sns.distplot(data['% Iron Concentrate'])

In [None]:
# Check 'Flotation Column 01 Air Flow' for outliers with a horizontal box plot.
# The series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, so relying on position changes or breaks the plot.
sns.boxplot(x=data['Flotation Column 01 Air Flow'])

In [None]:
# Check '% Iron Concentrate' for outliers with a horizontal box plot.
# The series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, so relying on position changes or breaks the plot.
sns.boxplot(x=data['% Iron Concentrate'])

In [None]:
# Check the target '% Silica Concentrate' for outliers with a horizontal box plot.
# The series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, so relying on position changes or breaks the plot.
sns.boxplot(x=data['% Silica Concentrate'])

In [None]:
# Drop outliers with percentiles: keep only the rows whose value lies strictly
# between the 5th and the 95th percentile in EVERY column.
# Both limits are computed once, on the untrimmed frame. Filtering one column at a
# time would make each later column's percentiles depend on the rows already
# removed for earlier columns, i.e. the result would depend on the column order.
lower_lim = data.quantile(.05)
upper_lim = data.quantile(.95)

# Column-wise comparison against the per-column limits; a row must pass in all columns.
inside_limits = ((data > lower_lim) & (data < upper_lim)).all(axis=1)
data = data[inside_limits]

In [None]:
# Before splitting into train and test data: the features differ in units and
# magnitude, so they are rescaled with MinMaxScaler to share the same magnitude.

Y = data['% Silica Concentrate']
X = data.drop(['% Silica Concentrate'], axis=1)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Keep X's row index on the scaled frame. Without `index=X.index` the new frame gets
# a fresh 0..n-1 index while Y keeps the labels left over from the outlier filtering,
# so any index-aligned operation between X_scaled and Y would silently mismatch rows.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split,
# so the test rows influence the scaling parameters — confirm this is acceptable.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)


In [None]:
# After scaling, the features share the same magnitude,
# so we can split the data into train and test sets:
# test_size=0.3 holds out 30% of the rows for testing, and the fixed
# random_state makes the split reproducible between runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                    Y,
                                                    test_size=0.3,
                                                   random_state=30)

**Making A Model**

After the preprocessing step, we move on to the model. In this step we have to find the model/algorithm that gives the highest accuracy without overfitting or underfitting.

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator itself) ...
reg = LinearRegression().fit(X_train, y_train)

# ... and predict the target for the held-out test split
y_pred = reg.predict(X_test)

**EVALUATING MODEL**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compute every metric on the held-out test split first ...
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# ... then report them; the RMSE is simply the square root of the MSE computed above
print('Our mean squared error is: ', MSE)
print('Our mean absolute error is: ', MAE)
print('Our R2 score is: ', R2)
print('Our Root Mean Squared Error is:', np.sqrt(MSE))

In [None]:
# Mean of the actual target values in the test split
# (presumably shown as a reference scale for the error metrics — confirm)
y_test.mean()

In [None]:
# Checking multicollinearity with the variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# variance_inflation_factor regresses one column on all the other columns of the
# matrix it is given and does not add an intercept itself, so a constant column is
# prepended first; without it the auxiliary regressions are forced through the
# origin and the VIFs of non-centred features come out distorted.
design = add_constant(X_train, has_constant='add')

# Position 0 is the constant itself, so the features start at position 1.
vif = [variance_inflation_factor(design.values, i) for i in range(1, design.shape[1])]
pd.DataFrame({'vif': vif}, index=X_train.columns).T

In [None]:
# Plotting the distribution of the residuals (actual minus predicted) on the test split
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot / sns.displot once the environment's seaborn version supports them.
residual = y_test - y_pred
sns.distplot(residual)

In [None]:
# Normal probability (Q-Q) plot of the residuals
import scipy as sp
import scipy.stats  # `import scipy` alone does not guarantee the stats submodule is loaded

fig, ax = plt.subplots(figsize=(6,2.5))
# probplot returns ((theoretical quantiles, ordered values), (slope, intercept, r)).
# The unused parts get descriptive throwaway names instead of `_`, `__`, `___`,
# which IPython itself uses to hold the outputs of the previous cells.
(_quantiles, _ordered), (_slope, _intercept, r) = sp.stats.probplot(residual, plot=ax, fit=True)

In [None]:
# Checking homoscedasticity: residuals plotted against the predicted values.
# x and y are passed by keyword — newer seaborn versions accept them as keywords
# only and raise a TypeError when two vectors are passed positionally.
sns.scatterplot(x=y_pred, y=residual)

In [None]:
# Check that the residuals show no autocorrelation
# NOTE(review): train_test_split shuffles rows by default, so `residual` is not in
# time order here and the lag structure below may not reflect the real process —
# confirm whether the residuals should be put back in order (e.g. by index) first.
import statsmodels.tsa.api as smt

acf = smt.graphics.plot_acf(residual, lags=40 , alpha=0.05)
# Render through pyplot: calling .show() on the Figure itself needs a GUI backend
# and only emits a warning under the notebook's inline backend.
plt.show()


In [None]:
# Put the actual and the predicted target values side by side
# and display the first 20 rows of the test split
result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
result.head(20)

In [None]:
# Predicted vs. actual values on the test split, with the identity line as reference
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111)
ax.scatter(y_test, y_pred)

# Reference line y = x (a perfect prediction): both end points use the same value
# on both axes. Ending the line at (max(y_test), max(y_pred)) instead would give it
# a slope that depends on the data rather than a slope of exactly 1.
line_max = max(max(y_test), max(y_pred))
ax.plot([0, line_max], [0, line_max], color='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

# Render through pyplot; Figure.show() only warns under the inline notebook backend.
plt.show()

**CONCLUSION ON THE LINEAR REGRESSION MODEL**

In [None]:
# One fitted coefficient per (scaled) feature of the linear model, labelled by feature name
coeff_result = pd.DataFrame(data=reg.coef_, index=X.columns, columns=['Coefficient'])
coeff_result