<a href="https://colab.research.google.com/github/ehsan74814/Preprocessing_Data/blob/main/GradientDescent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from that mount later.
from google.colab import drive
drive.mount("/content/drive")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**By default, the regression uses the normal equation, but as the number of features increases, it switches to batch gradient descent.**

**For model design :**
**Gradient Descent :**

**1) Batch Gradient Descent =====> model = LinearRegression**

**2) Stochastic Gradient Descent ====> model = SGDRegressor()**

**3) mini-batch gradient Descent ====> model = SGDRegressor(average=True)**


In [None]:
# Read the student dataset from the mounted Drive folder and preview the first rows.
# NOTE(review): absolute Colab path; the Drive mount cell must have run first.
df = pd.read_csv("/content/drive/MyDrive/Training ML/GradientDescent/student-dataset.csv")
df.head()

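- **Portfolio Rating** ===> ارزیابی جامع و چند بعدی از پیشرفت (a comprehensive, multi-dimensional assessment of progress)
- **Coverletter Rating** ===> پیش بینی موفقیت در دریافت یک شغل (predicts success in getting a job)
- **Refletter Rating** ===> پیش بینی موفقیت در استخدام یا پذیرش تحصیلی (predicts success in being hired or admitted academically)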

In [4]:
# Lower-case every column name; later cells refer to columns by their lower-case names.
df.columns = df.columns.str.lower()

In [None]:
# Show each column's dtype and non-null count to spot missing values.
df.info()

In [None]:
# Show summary statistics (count, mean, std, quartiles, ...) for the columns describe() covers.
df.describe()

In [5]:
# Missing values: count nulls per column *before* anything is dropped.
null_counts = df.isnull().sum()

# Drop the two columns that are not carried forward.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps the
# cell safe to re-run: a second execution no longer raises KeyError.
# NOTE(review): presumably "ethnic.group" is the column holding the nulls - confirm
# against the counts displayed below.
df = df.drop(columns=["ethnic.group", "id"], errors="ignore")

# Last expression of the cell, so the per-column null counts are actually rendered.
null_counts

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Boxplot grid for spotting outliers: one panel per numeric column, two panels per row.
import matplotlib.pyplot as plt

numeric_columns = df.select_dtypes(include=["number"]).columns

num_columns = len(numeric_columns)
num_rows = (num_columns + 1) // 2  # number of grid rows needed for two panels per row
fig = plt.figure(figsize=(10, num_rows * 4))

# Subplot positions are 1-based, hence start=1.
for position, column in enumerate(numeric_columns, start=1):
    ax = fig.add_subplot(num_rows, 2, position)
    ax.boxplot(df[column])
    ax.set_title(f"Boxplot for {column}")
    ax.set_ylabel(column)

fig.tight_layout()
plt.show()

In [None]:
# Identify outliers
def Identify_outliner(df):
  """Count IQR-rule outliers in every numeric column of ``df``.

  A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
  ``Q3 + 1.5 * IQR`` of its own column.

  Args:
    df: DataFrame to inspect; non-numeric columns are ignored.

  Returns:
    dict mapping each numeric column name to its number of outlier rows
    (an empty dict when ``df`` has no numeric columns).
  """
  numeric_columns = df.select_dtypes(include=["number"]).columns
  outliner_summery = {}

  for column in numeric_columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lb = Q1 - 1.5 * IQR
    ub = Q3 + 1.5 * IQR

    # Comparisons with NaN are False, so missing values are never counted.
    is_outlier = (df[column] < lb) | (df[column] > ub)
    outliner_summery[column] = int(is_outlier.sum())

  # Return only after *all* columns were processed; returning inside the loop
  # reported the first numeric column only.
  return outliner_summery


# Run the outlier count on the full frame.
outliner_report = Identify_outliner(df)


# Print one line per reported column (Persian message: "column {column} contains {count} outliers").
for column, count in outliner_report.items():
  print(f"ستون {column} شامل {count}  داده پرت است")

In [None]:
def remove_outlinera(df):
  """Return a copy of ``df`` without the rows that are IQR-rule outliers.

  A row is removed when, in at least one numeric column, its value lies below
  ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. All bounds are computed on
  the unfiltered input, so the result does not depend on column order.

  Args:
    df: DataFrame to clean; it is not modified. Non-numeric columns are ignored.

  Returns:
    A new DataFrame with the outlier rows removed (original index preserved).
  """
  # Derive the numeric columns from the argument rather than from a notebook global.
  numeric_columns = df.select_dtypes(include=["number"]).columns
  keep = pd.Series(True, index=df.index)

  for column in numeric_columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lb = Q1 - 1.5 * IQR
    ub = Q3 + 1.5 * IQR

    # Comparisons with NaN are False, so rows with missing values are kept.
    is_outlier = (df[column] < lb) | (df[column] > ub)
    keep = keep & ~is_outlier

  # Return after every column was checked, and return the rows to KEEP
  # (the previous version returned the outlier rows of the first column).
  return df.loc[keep].copy()



# Call remove_outlinera on the full frame and print summary statistics of what it returns.
# NOTE(review): in the cells visible here, data_cleaned is not used again; later cells keep using df.
data_cleaned = remove_outlinera(df)
print(data_cleaned.describe())

In [9]:
# Feature Scaling
# Standardize every numeric column of df with StandardScaler and write the result back into df.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numeric_columns = df.select_dtypes(include=["number"]).columns

# NOTE(review): this runs before the train/test split and on all numeric columns,
# which presumably includes the target "refletter.rating" - confirm this is intended:
# the scaler is fitted on rows that later become test data, and the plots and metrics
# computed from df afterwards are in standardized units.
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [10]:
# Drop the identifying, location and demographic columns before plotting and modelling.
columns_to_drop = [
    "name",
    "nationality",
    "city",
    "latitude",
    "longitude",
    "gender",
    "age",
]
df = df.drop(columns=columns_to_drop)

# Visualization

In [None]:
# Scatter plot between math score and language score
plt.figure(figsize=(10,6))
# The scatter needs a label: plt.legend() with no labelled artist only emits a
# warning and draws an empty legend.
plt.scatter(df["math.grade"], df["language.grade"], color="blue", label="Students")
plt.title("Scatter plot between math score and language score")
plt.xlabel("math Score")
plt.ylabel("language Score")
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Bar chart to compare the mean of each course grade
# NOTE(review): the numeric columns were standardized in the Feature Scaling cell,
# so these means will be close to 0 - confirm whether raw grades were intended here.
grade_columns = ["english.grade", "math.grade", "sciences.grade", "language.grade"]

plt.figure(figsize=(10,6))
df[grade_columns].mean().plot(kind="bar", color="orange")
plt.title("Average grades of courses")
plt.xlabel("course")
plt.ylabel("mean scores")
plt.xticks(rotation=45)
plt.grid(axis="y")
# No plt.legend(): the bars come from one unnamed Series and are already
# identified by the x tick labels, so a legend has nothing meaningful to show.
plt.show()

In [None]:
# Box plot for the distribution of each course grade (one box per column)
grade_columns = ["english.grade", "math.grade", "sciences.grade", "language.grade"]

plt.figure(figsize=(10,6))
sns.boxplot(data=df[grade_columns])
plt.title("Box diagram of the distribution of course grades")
plt.ylabel("scores")
plt.grid(axis="y")
# No plt.legend(): the boxes carry no labels (the x axis already names each course),
# so the call only produced a "no artists with labels" warning.
plt.show()

In [None]:
# Correlation diagram
plt.figure(figsize=(10,6))
correlation_matrix = df.corr()

# Hide the diagonal and the upper triangle only: the matrix is symmetric, so the
# lower triangle already contains every distinct pair. (k=-2 additionally masked
# the first two sub-diagonals and therefore hid real correlations.)
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    square=True,
    mask=mask,
    linewidths=1,  # numeric line width (was passed as the string "1")
    linecolor="black",
    cbar=True,
    cbar_kws={"shrink": .8},
)
plt.title("Correlation diagram of features")
plt.show()

# Test and Train

In [None]:
from sklearn.model_selection import train_test_split

# The reference-letter rating is the regression target; all other remaining columns are features.
target_column = "refletter.rating"
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [12]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Linear regression trained with stochastic gradient descent at a constant learning rate.
# random_state pins the per-epoch shuffling of the training data, so re-running the
# notebook gives the same coefficients and metrics.
model = SGDRegressor(
    max_iter=10000,
    tol=1e-6,
    learning_rate="constant",
    eta0=0.01,
    random_state=42,
).fit(X_train, y_train)

In [13]:
# Predict the target for the held-out test features.
y_pred = model.predict(X_test)

In [14]:
# Fitted parameters of the linear model: m = coefficients, b = intercept.
m = model.coef_
b = model.intercept_

In [None]:
# Evaluate the predictions on the held-out test split.
# Both metrics are stored and shown together; previously the r2_score result was
# discarded because only a cell's last expression is displayed.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
pd.Series({"r2": r2, "mse": mse}, name="test metrics")

In [None]:
# Predicted vs. actual target values on the test split
plt.figure(figsize=(10,6))
plt.scatter(y_test, y_pred, color="blue", label="Predictions")

# Identity line y = x: a perfect prediction would fall exactly on it.
# Its end points follow the data range instead of a hard-coded [1, 5], because the
# numeric columns were standardized earlier and need not lie in that interval.
low = min(y_test.min(), y_pred.min())
high = max(y_test.max(), y_pred.max())
plt.plot([low, high], [low, high], "--", color="red", label="Perfect prediction (y = x)")

plt.xlabel("Actual value")
plt.ylabel("Predicted value")
plt.title("Gradient Descent: predicted vs. actual")
plt.grid(True)
# Single legend call that uses the artist labels set above (the earlier
# plt.legend([...]) call was overridden by this one anyway).
plt.legend()
plt.show()

In [None]:
#