Data Preprocessing

In [None]:
import pandas as pd
import numpy as np


data = {
    'ID': ['001', '002', '003', '004', '005', '006', '007'],
    'Name': ['Ali', None, 'Reza', 'Maryam', 'Ali', 'Hossein', 'Nazanin'],
    'Age': ['27', 'twenty eight', '30', '29', '27', None, '25'],
    'Email': ['ali@gmail', 'sara@yahoo.com', 'REZA@GMAIL.COM', 'maryam@gmail.com', 'ali@gmail', 'hossein123@@mail.com', 'nazanin@outlook.com'],
    'PurchaseAmount': ['120.5', '89.99', 'NaN', '105', '120.5', '97.3', 'abc'],
    'SignupDate': ['2021/04/11', '04-12-2021', '2021.04.13', '2021/04/14', '2021/04/11', '2021-14-04', '2021/04/15']
}

df = pd.DataFrame(data)

df = df.drop_duplicates()


def convert_age(age):
    try:
        return int(age)
    except:
        if isinstance(age, str) and 'twenty eight' in age.lower():
            return 28
        return np.nan

df['Age'] = df['Age'].apply(convert_age)


df = df.dropna(subset=['Age'])


df['PurchaseAmount'] = pd.to_numeric(df['PurchaseAmount'], errors='coerce')
mean_purchase = df['PurchaseAmount'].mean()
df['PurchaseAmount'].fillna(mean_purchase, inplace=True)


def valid_email(email):
    if isinstance(email, str) and email.count('@') == 1 and ' ' not in email:
        return True
    return False

df = df[df['Email'].apply(valid_email)]


def parse_date(date_str):
    for fmt in ('%Y/%m/%d', '%m-%d-%Y', '%Y.%m.%d', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except:
            continue
    return pd.NaT

df['SignupDate'] = df['SignupDate'].apply(parse_date)
df = df.dropna(subset=['SignupDate'])

print(df)


    ID     Name   Age                Email  PurchaseAmount SignupDate
0  001      Ali  27.0            ali@gmail        120.5000 2021-04-11
1  002     None  28.0       sara@yahoo.com         89.9900 2021-04-12
2  003     Reza  30.0       REZA@GMAIL.COM        108.9975 2021-04-13
3  004   Maryam  29.0     maryam@gmail.com        105.0000 2021-04-14
4  005      Ali  27.0            ali@gmail        120.5000 2021-04-11
6  007  Nazanin  25.0  nazanin@outlook.com        108.9975 2021-04-15


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PurchaseAmount'].fillna(mean_purchase, inplace=True)


Binning

In [None]:

bins_age = [0, 26, 29, 100]
labels_age = ['Young', 'Middle', 'Old']
df['AgeBin'] = pd.cut(df['Age'], bins=bins_age, labels=labels_age)

bins_purchase = [0, 90, 110, np.inf]
labels_purchase = ['Low', 'Medium', 'High']
df['PurchaseBin'] = pd.cut(df['PurchaseAmount'], bins=bins_purchase, labels=labels_purchase)

print(df[['Age', 'AgeBin', 'PurchaseAmount', 'PurchaseBin']])


    Age  AgeBin  PurchaseAmount PurchaseBin
0  27.0  Middle        120.5000        High
1  28.0  Middle         89.9900         Low
2  30.0     Old        108.9975      Medium
3  29.0  Middle        105.0000      Medium
4  27.0  Middle        120.5000        High
6  25.0   Young        108.9975      Medium


L_Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['NameEncoded'] = le.fit_transform(df['Name'])

df['SignupDayOfYear'] = df['SignupDate'].dt.dayofyear

X = df[['Age', 'NameEncoded', 'SignupDayOfYear']]
y = df['PurchaseAmount']

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)
print("Predictions:", y_pred)


Predictions: [119.54387931  95.72672414 100.71112069 112.01155172 119.54387931
 106.44784483]


Cross-Validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(-scores)

print("RMSE for each fold:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation of RMSE:", rmse_scores.std())


RMSE for each fold: [19.60630328 62.386875   19.19596154  1.89051136 22.68613636]
Mean RMSE: 25.15315750821684
Standard deviation of RMSE: 19.999211949156553
