### Fundamental Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the matplotlib figures drawn later use seaborn styling
sns.set()

### Loading data into data frame, Null values and Dropping columns.

In [None]:
# Load the dataset; replace the empty string with the path to your CSV file.
df = pd.read_csv('')

# Preview the first rows and the summary statistics.
# A DataFrame is not callable, so use .head() (or a bare `df`) to look at it.
# Note: a notebook cell only renders its LAST expression; wrap intermediate
# results in display(...) or move them to their own cells to see them all.
df.head()
df.describe()


# Getting all column names
df.columns
df.columns.values

# Checking for null values [count of null values in each column]
df.isnull().sum()


# Dropping rows with null values present in them.
# dropna returns a NEW frame and leaves df untouched; assign the result to keep it.
df.dropna(axis = 0)


# Dropping an entire column (if not required).
# drop also returns a new frame rather than modifying df in place.
df.drop(['COLUMN NAME'], axis = 1)



### Replacing missing values with other values!

Typical replacement values are the column mean, median, or mode.

In [None]:
# Filling one column's missing values with that column's median
df['col1'] = df['col1'].fillna(df['col1'].median())



# Filling multiple columns together (each column is filled with its own median)
df[['col1', 'col2']] = df[['col1', 'col2']].fillna(df[['col1', 'col2']].median())



# Filling all numeric columns with their medians.
# numeric_only=True restricts the median to numeric columns; without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a text column.
# Non-numeric columns are therefore left unfilled by this step.
df = df.fillna(df.median(numeric_only=True))

### Dealing with Outliers!

In [None]:
# Distribution plot of the column, used to judge where the outliers sit
sns.displot(df['COLUMN NAME'])



# Cut-off values chosen from the distribution plot.
# quantile() takes a FRACTION in [0, 1], so the 99th / 1st percentiles are 0.99 / 0.01.
q1 = df['COLUMN NAME'].quantile(0.99)   # upper cut-off (99th percentile)
q2 = df['COLUMN NAME'].quantile(0.01)   # lower cut-off (1st percentile)

# Keep only the rows strictly between the 1st and 99th percentile.
# A chained comparison (q2 < series < q1) is not valid on a Series, so the two
# element-wise conditions are combined with `&` into a single boolean mask.
df_no_outliers = df[(df['COLUMN NAME'] > q2) & (df['COLUMN NAME'] < q1)]

###  Handling categorical data and dummy variables!

In [None]:
# To include the categorical data in the regression, let's create dummies.
# pandas provides a convenient function, 'get_dummies', which does that seamlessly.
# It is important to drop one dummy per category (drop_first=True); otherwise the
# dummies are perfectly collinear and we introduce multicollinearity.
df_with_dummies = pd.get_dummies(df, drop_first=True)
df_with_dummies.head()

# To rearrange the columns of the data frame:
# list the column names in the desired order (extend this placeholder list as needed),
# then select them in that order.
columns = ['c1', 'c2', 'c3']
df = df_with_dummies[columns]

#### Resetting Index

In [None]:
# Rebuild the row index as 0..n-1; drop=True discards the old index labels
# instead of keeping them as an extra column.
df_cleaned = df.reset_index(drop=True)

# Preview the first five rows of the re-indexed frame.
df_cleaned.head(5)

### Checking Relationship between two columns graphically for further analysis!

Multiple Scatter plots

In [None]:
# One panel per input column, side by side.
# sharey=True makes all three panels share 'OUTPUT COLUMN' as their y-axis.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15, 3))

# The cleaned frame is `df_cleaned` (created by reset_index above in the notebook).
# Draw each input column against the output column instead of repeating the same
# two calls three times.
input_columns = ('INPUT COLUMN 1', 'INPUT COLUMN 2', 'INPUT COLUMN 3')
for ax, input_col in zip((ax1, ax2, ax3), input_columns):
    ax.scatter(df_cleaned[input_col], df_cleaned['OUTPUT COLUMN'])
    ax.set_title(f'{input_col} and OUTPUT COLUMN')

# The result is three plots with a common y-axis, each showing how one input
# (independent) column relates to the output (dependent) column.

## Statistical data transformation

In [None]:
# Checking the distribution of the COLUMN to be transformed.
# displot is used because distplot is deprecated in current seaborn releases.
sns.displot(df_cleaned['COLUMN'])


# LOG transformation (natural logarithm).
# Zero values become -inf and negative values become NaN, so make sure the
# column is strictly positive before transforming it.
log_column = np.log(df_cleaned['COLUMN'])

# Then we add the transformed values to our data frame as a new column
df_cleaned['log_COLUMN'] = log_column

# Dropping the original COLUMN if not required.
# drop returns a NEW frame and leaves df_cleaned untouched; assign the result to keep it.
df_cleaned.drop(['COLUMN'], axis = 1)

###  Feature scaling



In [None]:
# StandardScaler is the scaling tool used for the input features
from sklearn.preprocessing import StandardScaler

# Build the scaler and fit it in one step: fit() computes the feature-wise mean
# and standard deviation and returns the fitted scaler itself.
# NOTE(review): the scaler is fitted on all of `inputs`, i.e. before the train/test
# split performed later in the notebook, so test rows influence the scaling
# statistics - consider fitting on the training split only.
scaler = StandardScaler().fit(inputs)

# The actual scaling: apply the fitted statistics and keep the result in a new variable
inputs_scaled = scaler.transform(inputs)

### Train/test split

In [None]:
# train_test_split performs the shuffling and partitioning for us
from sklearn.model_selection import train_test_split

# 80% of the rows go to training and 20% to testing.
# A fixed random_state makes the shuffle repeatable; 365 reproduces the author's split.
x_train, x_test, y_train, y_test = train_test_split(
    inputs_scaled,
    targets,
    test_size=0.2,
    random_state=365,
)

### Create the regression

In [None]:
# LinearRegression is not imported anywhere earlier in the notebook, so bring it
# into scope here; without this import the cell fails with a NameError.
from sklearn.linear_model import LinearRegression

# Create a linear regression object
reg = LinearRegression()
# Fit the regression with the scaled TRAIN inputs and targets
reg.fit(x_train, y_train)

# Let's check the outputs of the regression on the data it was trained on.
# They are stored in y_hat, the conventional name for predictions.
y_hat = reg.predict(x_train)



# Scatter the training targets against the model's predictions for them.
# Points lying close to the 45-degree line indicate good predictions.
plt.scatter(y_train, y_hat)

# Axis names
plt.xlabel('Targets (y_train)', fontsize=18)
plt.ylabel('Predictions (y_hat)', fontsize=18)

# Give both axes the same range; with different scales the '45-degree line'
# could not be read off the plot.
# NOTE(review): 6-13 is a hard-coded range - presumably chosen for one particular
# target column; adjust it to the range of your own targets.
plt.xlim(left=6, right=13)
plt.ylim(bottom=6, top=13)
plt.show()


# The bias (intercept) learned by the regression
reg.intercept_

# The weights (coefficients) learned by the regression, one per feature
reg.coef_


# Put feature names and weights next to each other so they can be compared:
# start from a one-column frame of feature names, then attach the weights.
reg_summary = pd.DataFrame({'Features': inputs.columns.values}).assign(Weights=reg.coef_)
reg_summary




# With the model trained, evaluate it on data it has never seen: the held-out
# split 'x_test' (inputs) and 'y_test' (targets).
# The model must NOT be trained on these rows - we only feed them in and collect
# the predictions. Predictions that are far off suggest the model overfitted.
y_hat_test = reg.predict(x_test)


# Scatter the test targets against the test predictions.
# 'alpha' makes the markers semi-transparent so dense regions stand out.
plt.scatter(y_test, y_hat_test, alpha=0.2)

# Axis names
plt.xlabel('Targets (y_test)', fontsize=18)
plt.ylabel('Predictions (y_hat_test)', fontsize=18)

# Same range on both axes, matching the training plot
plt.xlim(left=6, right=13)
plt.ylim(bottom=6, top=13)
plt.show()




