In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

csv_path = r'D:\Downloads\Mall_Customers.csv'
df = pd.read_csv(csv_path)

print(df.head()) 
print(df.tail())
print(df.info())  
print(df.describe()) 

print(df.isna().sum())

pd.set_option("display.max_rows", 100)
print(df.head(100))

df.rename(columns={
    'CustomerID': 'customer_id',
    'Gender': 'gender',
    'Age': 'age',
    'Annual Income (k$)': 'annual_income',
    'Spending Score (1-100)': 'spending_score'
}, inplace=True)

df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df.set_index('customer_id', inplace=True) 
df.index.name = 'customer_id'
df.index = pd.to_datetime(df.index)
df.reset_index(inplace=True) 

df_melted = df.melt(id_vars=['customer_id'], value_vars=['age', 'annual_income', 'spending_score'])
df_pivot = df.pivot(index='customer_id', columns='gender', values='spending_score')
print(df_melted.head())
print(df_pivot.head())

df1 = df[['customer_id', 'gender', 'age']]
df2 = df[['customer_id', 'annual_income', 'spending_score']]

df_merged = df1.merge(df2, on='customer_id', how='inner') 


df_concat = pd.concat([df1, df2], axis=1)  
df['age_group'] = df['age'].apply(lambda x: 'Young' if x < 30 else 'Old')

df_numeric = df[['annual_income', 'spending_score']]
df_scaled = df_numeric.applymap(lambda x: x / 100)
filtered_df = df.query('age > 40 and spending_score > 60')
grouped_df = df.groupby('gender').agg({
    'age': 'mean',
    'annual_income': 'mean',
    'spending_score': ['min', 'max', 'mean']  
})

grouped_df.reset_index(inplace=True)
print(grouped_df)
agg_df = df.agg({
    'age': ['min', 'max', 'mean'],
    'annual_income': ['min', 'max', 'mean'],
    'spending_score': ['min', 'max', 'mean']
})
print(agg_df)

df_dropped = df.dropna()
df_filled = df.fillna(0) 

df_filled_mean = df.fillna(df.mean(numeric_only=True))

imputer = SimpleImputer(strategy='mean')
df[['annual_income', 'spending_score']] = imputer.fit_transform(df[['annual_income', 'spending_score']])
imputer_median = SimpleImputer(strategy='median')
df[['annual_income', 'spending_score']] = imputer_median.fit_transform(df[['annual_income', 'spending_score']])


   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40
     CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
195         196  Female   35                 120                      79
196         197  Female   45                 126                      28
197         198    Male   32                 126                      74
198         199    Male   32                 137                      18
199         200    Male   30                 137                      83
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non

In [23]:
import pandas as pd
import re

csv_path =  r'D:\Downloads\Mall_Customers.csv'
df = pd.read_csv(csv_path)

def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    return text

df['CustomerID'] = df['CustomerID'].apply(clean_text)
df['Gender'] = df['Gender'].apply(clean_text)
df['Age'] = df['Age'].apply(clean_text)
df['Annual Income (k$)'] = df['Annual Income (k$)'].apply(clean_text)
df['Spending Score (1-100)'] = df['Spending Score (1-100)'].apply(clean_text)

print(df.head())


   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


In [11]:
import pandas as pd
import re
import json

url = "https://raw.githubusercontent.com/nnqomariyah/Fundamentals_of_Data_Science/main/week_3/data/movies_metadata.csv"
df = pd.read_csv(url, low_memory=False)

def clean_text(text):
    text = str(text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df = df.applymap(lambda x: clean_text(x) if isinstance(x, str) else x)

def json_flatten(text):
    try:
        json_data = json.loads(text.replace("'", "\""))
        return ', '.join([item['name'] for item in json_data])
    except:
        return text

if 'genres' in df.columns:
    df['genres'] = df['genres'].apply(json_flatten)

print(df.head())
df.to_csv('cleaned_movies_metadata.csv', index=False)


   adult                              belongs_to_collection    budget  \
0  False  id 10194 name Toy Story Collection posterpath ...  30000000   
1  False                                                NaN  65000000   
2  False  id 119050 name Grumpy Old Men Collection poste...         0   
3  False                                                NaN  16000000   
4  False  id 96871 name Father of the Bride Collection p...         0   

                                              genres  \
0  id 16 name Animation id 35 name Comedy id 1075...   
1  id 12 name Adventure id 14 name Fantasy id 107...   
2            id 10749 name Romance id 35 name Comedy   
3  id 35 name Comedy id 18 name Drama id 10749 na...   
4                                  id 35 name Comedy   

                        homepage     id    imdb_id original_language  \
0  httptoystorydisneycomtoystory    862  tt0114709                en   
1                            NaN   8844  tt0113497                en   
2       