In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
netflix_data_df = pd.read_parquet("C:\\Users\\saigi\\PycharmProjects\\PySpark_Project\\target_data")
netflix_data_df.head()

Unnamed: 0,show_id,type,title,director,cast,date_added,release_year,rating,movie_duration,seasons,...,released_countries,released_languages,movie_type,show_type,genres,added_date,year,month,day,day_of_week
0,s3435,Movie,The Influence,Dennis Rovira van Boekholt,"Manuela Vellés, Maggie Civantos, Alain Hernánd...","October 11, 2019",2019,TV-MA,102.0,,...,"[Spain, France]","[Spanish, French]",Global,,"[Horror Movies, International Movies]",2019-10-11,2019,October,11,Friday
1,s4067,Movie,Your Son,Miguel Ángel Vivas,"José Coronado, Ana Wagener, Asia Ortega, Pol M...","March 1, 2019",2018,TV-MA,103.0,,...,"[Spain, France]","[Spanish, French]",Global,,"[Dramas, International Movies, Thrillers]",2019-03-01,2019,March,1,Friday
2,s4068,Movie,Quién te cantará,Carlos Vermut,"Najwa Nimri, Eva Llorach, Carme Elias, Natalia...","February 28, 2019",2018,TV-MA,125.0,,...,"[Spain, France]","[Spanish, French]",Global,,"[Dramas, Independent Movies, International Mov...",2019-02-28,2019,February,28,Thursday
3,s4468,Movie,Gun City,Dani de la Torre,"Luis Tosar, Michelle Jenner, Vicente Romero, M...","October 31, 2018",2018,TV-MA,126.0,,...,"[Spain, France]","[Spanish, French]",Global,,"[Dramas, International Movies, Thrillers]",2018-10-31,2018,October,31,Wednesday
4,s4513,Movie,Errementari: The Blacksmith and the Devil,Paul Urkijo Alijo,"Kandido Uranga, Uma Bracaglia, Eneko Sagardoy,...","October 12, 2018",2018,TV-MA,99.0,,...,"[Spain, France]","[Spanish, French]",Global,,"[Horror Movies, International Movies, Sci-Fi &...",2018-10-12,2018,October,12,Friday


In [3]:
categorical_cols = ['type', 'rating' , 'movie_type', 'show_type']
netflix_data_df[categorical_cols].isnull().sum()

type             0
rating           0
movie_type    2051
show_type     5421
dtype: int64

In [4]:
# Multi-Label Binarization (for genres, countries, languages)
mlb_cols = ['genres', 'released_countries', 'released_languages']

for col in mlb_cols:
    mlb = MultiLabelBinarizer()
    mlb_data = mlb.fit_transform(netflix_data_df[col])
    mlb_df = pd.DataFrame(mlb_data, columns=[f"{col}_{c}" for c in mlb.classes_])
    df = pd.concat([netflix_data_df, mlb_df], axis=1)
    df.drop(columns=col, inplace=True)
mlb_df.head()

Unnamed: 0,released_languages_Afrikaans,released_languages_Albanian,released_languages_Amharic,released_languages_Arabic,released_languages_Armenian,released_languages_Azerbaijani,released_languages_Belarusian,released_languages_Bengali,released_languages_Berber,released_languages_Bulgarian,...,released_languages_Somali,released_languages_Spanish,released_languages_Swahili,released_languages_Swedish,released_languages_Tamil,released_languages_Thai,released_languages_Turkish,released_languages_Ukrainian,released_languages_Xhosa,released_languages_Zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
mlb_df.head()

Unnamed: 0,released_languages_Afrikaans,released_languages_Albanian,released_languages_Amharic,released_languages_Arabic,released_languages_Armenian,released_languages_Azerbaijani,released_languages_Belarusian,released_languages_Bengali,released_languages_Berber,released_languages_Bulgarian,...,released_languages_Somali,released_languages_Spanish,released_languages_Swahili,released_languages_Swedish,released_languages_Tamil,released_languages_Thai,released_languages_Turkish,released_languages_Ukrainian,released_languages_Xhosa,released_languages_Zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [6]:
import pandas as pd

def feature_engineering_dummies(netflix_df_pd):
    """Feature engineering using pd.get_dummies."""

    df = netflix_df_pd.copy()

    # 1. Categorical Features (using pd.get_dummies)

    categorical_cols = ['type', 'rating', 'movie_type', 'show_type']

    for col in categorical_cols:
      df = pd.get_dummies(df, columns=[col], prefix=col, dtype=int, dummy_na=False) # dtype=int


    # 2. Multi-Label Binarization (for genres, countries, languages)

    mlb_cols = ['genres', 'released_countries', 'released_languages']

    for col in mlb_cols:
        mlb = MultiLabelBinarizer()
        mlb_data = mlb.fit_transform(df[col])
        mlb_df = pd.DataFrame(mlb_data, columns=[f"{col}_{c}" for c in mlb.classes_], dtype=int)  # dtype=int
        df = pd.concat([df, mlb_df], axis=1)
        df.drop(columns=col, inplace=True)  # Drop original multi-label columns

    # 3. Numerical Features (Scaling)

    numerical_cols = ['movie_duration', 'seasons', 'release_year']
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # 4. Text Features (TF-IDF)

    text_col = 'description'
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_data = tfidf.fit_transform(df[text_col]).toarray()
    tfidf_df = pd.DataFrame(tfidf_data, columns=[f"description_tfidf_{i}" for i in range(tfidf_data.shape[1])])
    df = pd.concat([df, tfidf_df], axis=1)
    df.drop(columns=text_col, inplace=True)

    # 5. Date Features

    df['added_date'] = pd.to_datetime(df['added_date'])
    df['added_year'] = df['added_date'].dt.year
    df['added_month'] = df['added_date'].dt.month
    df['added_day'] = df['added_date'].dt.day
    df['added_dayofweek'] = df['added_date'].dt.dayofweek
    df.drop(columns='added_date', inplace=True)

    # 6. Imputation (if needed)

    imputer = SimpleImputer(strategy='median')
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)  # Apply imputation

    return df

In [7]:
df_engineered = feature_engineering_dummies(netflix_data_df)
print(df_engineered.head())

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 's3435'

In [None]:
df_engineered.head()