**The important dependencies we need to know for ML**

In [None]:
# for array and matrices in python
import numpy as np
# for making a dataframe and preprocessing stuff
import pandas as pd
# for plotting operations
import matplotlib.pyplot as plt
import seaborn as sns
# for directly accessing the sklearn datasets
import sklearn.datasets
# XGBRegressor is a gradient-boosted tree model used for regression (predicting continuous values)
from xgboost import XGBRegressor
# sklearn is an important ML python library
from sklearn.model_selection import train_test_split
# if our data is very random like 0.002 in one column and 501 in other we can standardise it to make better predictions
from sklearn.preprocessing import StandardScaler
# the Logistic Regression model (used for classification) can be imported from sklearn like this
from sklearn.linear_model import LogisticRegression
# import the support vector machine , A support vector machine (SVM) is a supervised machine learning algorithm that classifies data by finding an optimal line or hyperplane that maximizes the distance between each class in an N-dimensional space.
from sklearn import svm
# for checking the accuracy score of our model
from sklearn import metrics
from sklearn.metrics import accuracy_score

**Here I'll describe the pandas library, and various pre-processing methods we can use**

In [None]:
# Option 1: if the CSV file has been uploaded, pass its path to pandas to load it into a DataFrame.
# header=None treats the first row as data; remove it to use the first row as the column headings.
dataset = pd.read_csv('path/of/the/dataset',header=None)
# Option 2: load a dataset directly from sklearn (this rebinds `dataset`)
import_dataset = sklearn.datasets.fetch_california_housing()
dataset = pd.DataFrame(import_dataset.data,columns=import_dataset.feature_names)
# add the target array (the house prices) to the dataframe as a new 'price' column;
# the names must match the objects created just above
dataset['price'] = import_dataset.target

# first five rows of the dataset; pass a number, e.g. head(10), to get that many rows instead
dataset.head()
# shape of the data
dataset.shape
# statistical parameters of the dataset - count, mean, std, min and so on
dataset.describe()
# how many rows fall into each category of a column -> a good dataset has a comparable count of each category
dataset['legend of that column'].value_counts()
# mean of the other columns for each category
dataset.groupby('legend of that column').mean()
# drop a column that is of no use; drop() returns a new dataframe, so the result must be assigned back
# (columns= already says we are dropping a column, so axis=1 is not needed)
dataset = dataset.drop(columns='legend name')
# number of missing values in each column
dataset.isnull().sum()
# when only a few rows have missing values we can either fill them (e.g. with the column mean)
# or, as done here, simply drop every row that has a missing value
dataset = dataset.dropna()
# encode a column that has non-numerical categories;
# add one more 'column name': {category: number, ...} entry for every other column that needs encoding
dataset = dataset.replace({'legend':{'Category1':0,'Category2':1}})
# replace the value '3+' (e.g. in a dependents column) with 4 to make prediction easier
dataset = dataset.replace(to_replace='3+',value=4)



# Split the data into train and test sets BEFORE standardising, so the scaler never sees the test data.
# test_size is the fraction of the whole dataset held out for testing,
# stratify=Y keeps the proportion of each category the same in the train and test sets (classification targets only),
# random_state should be the same for two users to get the same split.
# NOTE(review): X (features) and Y (target) are assumed to be defined beforehand - they are not created in this cell.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)
# Standardising the data: some values are very big and others very small, so we bring everything to a common scale.
# Fit the scaler on the training set only, then apply the same transform to both train and test sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_standardised = scaler.transform(X_train)
X_test_standardised = scaler.transform(X_test)
# To train on scaled features, pass X_train_standardised / X_test_standardised to the model
# and run any new input through scaler.transform() before predicting.


**Correlation**

In [None]:
# Correlation between the various features in the dataset:
# 1) positive correlation (towards +1): both features change in the same direction
# 2) negative correlation (towards -1): the features change in opposite directions

correlation = dataset.corr()
# draw the correlation matrix as an annotated heatmap on a 10x10 figure
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(
    correlation,
    ax=ax,
    cmap='Blues',
    annot=True,
    fmt='.2f',
    annot_kws={'size':8},
    square=True,
    cbar=True,
)

**How to create a model and fit it to the data**

In [None]:
# create the LogisticRegression model -> it is used for classification purposes
model = LogisticRegression()
# train the model on X_train (features) and Y_train (labels)
model.fit(X_train,Y_train)
# accuracy on the training data: compare the true labels with the predicted labels
# (do the same with X_test / Y_test for the test accuracy; accuracy only applies to category prediction models)
train_data_prediction = model.predict(X_train)
train_data_accuracy = accuracy_score(Y_train,train_data_prediction)



# build a support vector machine classifier with a linear kernel and train it on the training data
# (fit() returns the fitted estimator, so creation and training can be chained)
classifier = svm.SVC(kernel='linear').fit(X_train,Y_train)



# create the regression model and train it on the training set
regressor = XGBRegressor()
regressor.fit(X_train,Y_train)
# Accuracy does not apply to a regression model, so we measure how far the predictions are from the true values
training_data_prediction = regressor.predict(X_train)
# R squared score: the closer to 1, the better the fit
score1 = metrics.r2_score(Y_train,training_data_prediction)
# Mean absolute error: the average absolute difference between true and predicted values - lower is better
score2 = metrics.mean_absolute_error(Y_train,training_data_prediction)
# the same two metrics can be calculated for the test set using regressor.predict(X_test) and Y_test


**How to build a predictive system from our ML model**

In [None]:
# A predictive system: classify one new sample given its feature values (60 values here)
input_data = (
    0.0192,0.0607,0.0378,0.0774,0.1388,0.0809,0.0568,0.0219,0.1037,0.1186,
    0.1237,0.1601,0.3520,0.4479,0.3769,0.5761,0.6426,0.6790,0.7157,0.5466,
    0.5399,0.6362,0.7849,0.7756,0.5780,0.4862,0.4181,0.2457,0.0716,0.0613,
    0.1816,0.4493,0.5976,0.3785,0.2495,0.5771,0.8852,0.8409,0.3570,0.3133,
    0.6096,0.6378,0.2709,0.1419,0.1260,0.1288,0.0790,0.0829,0.0520,0.0216,
    0.0360,0.0331,0.0131,0.0120,0.0108,0.0024,0.0045,0.0037,0.0112,0.0075,
)
# turn the tuple into a numpy array so it can be reshaped and fed to the model
input_data_as_numpy_array = np.asarray(input_data)
# the model expects a 2-D input, so present the single sample as one row with all features as columns
input_data_reshaped = np.reshape(input_data_as_numpy_array,(1,-1))
prediction = model.predict(input_data_reshaped)
print(prediction)

# translate the predicted label into a readable message: 'R' means rock, anything else is reported as a mine
print("The object is a Rock!" if prediction[0]=='R' else "The object is a Mine!")

**Plotting stuff with Matplotlib and seaborn**

In [None]:
# Scatter plot of the actual prices against the predicted prices
fig, ax = plt.subplots()
ax.scatter(Y_train,training_data_prediction)
ax.set_xlabel('Actual prices',color='b')
ax.set_ylabel('Predicted prices',color='r')
ax.set_title('Predicted versus Actual values')
plt.show()

# for a column that holds categories use a countplot; otherwise use a displot (distribution plot)
sns.countplot(data=loan_dataset,x='Education',hue='Loan_Status')