In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import zscore
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datetime import datetime
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = {
    'Age': [25, 30, np.nan, 35, 40, 200, 29, 30],  # Includes an outlier
    'Salary': [50000, 54000, 58000, np.nan, 65000, 70000, 72000, 50000],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Chicago', 'Los Angeles', np.nan, 'New York', 'Los Angeles'],
    'Review': ['The product is great!', 'Not bad at all', 'Worst experience', np.nan, 'Amazing!', 'Horrible!', 'Good value for money', 'Would not recommend'],
    'Date': ['2023-01-01', '2023-02-15', '2023-03-20', '2023-04-05', '2023-05-10', '2023-06-15', '2023-07-01', '2023-08-20']
}
df = pd.DataFrame(data)
df['Date'] = pd.to_datetime(df['Date'])

In [3]:
df.head()

Unnamed: 0,Age,Salary,City,Review,Date
0,25.0,50000.0,New York,The product is great!,2023-01-01
1,30.0,54000.0,Los Angeles,Not bad at all,2023-02-15
2,,58000.0,Chicago,Worst experience,2023-03-20
3,35.0,,Chicago,,2023-04-05
4,40.0,65000.0,Los Angeles,Amazing!,2023-05-10


In [None]:
#1. Handling mssing data
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])
df['Salary'] = imputer.fit_transform(df[['Salary']])
df['City'].fillna(df['City'].mode()[0], inplace=True)
df['Review'].fillna("No review", inplace=True)

In [7]:
df.head()

Unnamed: 0,Age,Salary,City,Review,Date
0,25.0,50000.0,New York,The product is great!,2023-01-01
1,30.0,54000.0,Los Angeles,Not bad at all,2023-02-15
2,55.571429,58000.0,Chicago,Worst experience,2023-03-20
3,35.0,59857.142857,Chicago,No review,2023-04-05
4,40.0,65000.0,Los Angeles,Amazing!,2023-05-10


In [None]:

#2. Removing Duplicates
df.drop_duplicates(inplace=True)

In [None]:
#  3. Handling Outliers (Using Z-score)
z_scores = np.abs(zscore(df[['Age', 'Salary']]))
df = df[(z_scores < 3).all(axis=1)]

In [10]:
z_scores

Unnamed: 0,Age,Salary
0,0.552727,1.235942
1,0.462328,0.7344
2,0.0,0.232859
3,0.371929,0.0
4,0.281529,0.644839
5,2.611248,1.271766
6,0.480408,1.522537
7,0.462328,1.235942


In [None]:
# 4. Encoding Categorical Data
encoder = OneHotEncoder(sparse_output=False)
cities_encoded = encoder.fit_transform(df[['City']])
cities_df = pd.DataFrame(cities_encoded, columns=encoder.get_feature_names_out(['City']))
df = pd.concat([df, cities_df], axis=1).drop(columns=['City'])

In [12]:
df

Unnamed: 0,Age,Salary,Review,Date,City_Chicago,City_Los Angeles,City_New York
0,25.0,50000.0,The product is great!,2023-01-01,0.0,0.0,1.0
1,30.0,54000.0,Not bad at all,2023-02-15,0.0,1.0,0.0
2,55.571429,58000.0,Worst experience,2023-03-20,1.0,0.0,0.0
3,35.0,59857.142857,No review,2023-04-05,1.0,0.0,0.0
4,40.0,65000.0,Amazing!,2023-05-10,0.0,1.0,0.0
5,200.0,70000.0,Horrible!,2023-06-15,0.0,1.0,0.0
6,29.0,72000.0,Good value for money,2023-07-01,0.0,0.0,1.0
7,30.0,50000.0,Would not recommend,2023-08-20,0.0,1.0,0.0


In [13]:
# 5. Feature Scaling
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])

In [14]:
df

Unnamed: 0,Age,Salary,Review,Date,City_Chicago,City_Los Angeles,City_New York
0,-0.552727,-1.235942,The product is great!,2023-01-01,0.0,0.0,1.0
1,-0.462328,-0.7344,Not bad at all,2023-02-15,0.0,1.0,0.0
2,0.0,-0.232859,Worst experience,2023-03-20,1.0,0.0,0.0
3,-0.371929,0.0,No review,2023-04-05,1.0,0.0,0.0
4,-0.281529,0.644839,Amazing!,2023-05-10,0.0,1.0,0.0
5,2.611248,1.271766,Horrible!,2023-06-15,0.0,1.0,0.0
6,-0.480408,1.522537,Good value for money,2023-07-01,0.0,0.0,1.0
7,-0.462328,-1.235942,Would not recommend,2023-08-20,0.0,1.0,0.0


In [16]:
df['Salary'].mean()

2.7755575615628914e-16

In [None]:
# 6. Feature Selection (Removing Low Variance Features)
selector = VarianceThreshold(threshold=0.1)
df = pd.DataFrame(selector.fit_transform(df), columns=df.columns[selector.get_support()])


In [19]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek
df.drop(columns=['Date'], inplace=True)

In [20]:
df

Unnamed: 0,Age,Salary,Review,City_Chicago,City_Los Angeles,City_New York,Year,Month,Day,DayOfWeek
0,-0.552727,-1.235942,The product is great!,0.0,0.0,1.0,2023,1,1,6
1,-0.462328,-0.7344,Not bad at all,0.0,1.0,0.0,2023,2,15,2
2,0.0,-0.232859,Worst experience,1.0,0.0,0.0,2023,3,20,0
3,-0.371929,0.0,No review,1.0,0.0,0.0,2023,4,5,2
4,-0.281529,0.644839,Amazing!,0.0,1.0,0.0,2023,5,10,2
5,2.611248,1.271766,Horrible!,0.0,1.0,0.0,2023,6,15,3
6,-0.480408,1.522537,Good value for money,0.0,0.0,1.0,2023,7,1,5
7,-0.462328,-1.235942,Would not recommend,0.0,1.0,0.0,2023,8,20,6
