# Informationen für dieses Notebook

Dieses .ipnynb dient dazu, den Originaldatensatz "kaggle_original_dataset.csv" zu laden, explorativ zu erkunden und erste Bereinigungen vor zu nehmen. Der Überarbeitete Datensatz wird als "df_movie_for_streamlit.csv" im Ordner "Datasets" gespeichert.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
%matplotlib inline
sns.set_theme(style="darkgrid", palette="bright", context="paper")

In [3]:
df_movies = pd.read_csv("kaggle_original_dataset.csv")

In [4]:
df_movies.shape

(33600, 23)

In [5]:
df_movies.isnull().sum()  # Anzahl fehlender Werte pro Spalte

id                           0
Title                        0
Movie Link                   0
Year                         0
Duration                   221
MPA                       7976
Rating                     138
Votes                      138
budget                   21785
grossWorldWide           15378
gross_US_Canada          16029
opening_weekend_Gross    18077
directors                  359
writers                   1576
stars                      473
genres                     382
countries_origin           366
filming_locations         6729
production_companies      1378
Languages                  474
wins                         0
nominations                  0
oscars                       0
dtype: int64

In [6]:
df_movies.duplicated().sum()  # Anzahl der doppelten Zeilen anzeigen

0

In [7]:
df_movies.columns

Index(['id', 'Title', 'Movie Link', 'Year', 'Duration', 'MPA', 'Rating',
       'Votes', 'budget', 'grossWorldWide', 'gross_US_Canada',
       'opening_weekend_Gross', 'directors', 'writers', 'stars', 'genres',
       'countries_origin', 'filming_locations', 'production_companies',
       'Languages', 'wins', 'nominations', 'oscars'],
      dtype='object')

In [8]:
df_movies.nunique()

id                       33600
Title                    31935
Movie Link               33600
Year                        65
Duration                   230
MPA                         26
Rating                      86
Votes                     1758
budget                    1140
grossWorldWide           18033
gross_US_Canada          17211
opening_weekend_Gross    14751
directors                14520
writers                  27123
stars                    32812
genres                    8540
countries_origin          2938
filming_locations        12383
production_companies     25940
Languages                 2709
wins                         1
nominations                220
oscars                      12
dtype: int64

In [9]:
# Alle Spalten anzeigen
pd.set_option('display.max_columns', None)

In [10]:
df_movies.head(5)

Unnamed: 0,id,Title,Movie Link,Year,Duration,MPA,Rating,Votes,budget,grossWorldWide,gross_US_Canada,opening_weekend_Gross,directors,writers,stars,genres,countries_origin,filming_locations,production_companies,Languages,wins,nominations,oscars
0,tt0073195,Jaws,https://www.imdb.com/title/tt0073195,1975,2h 4m,PG,8.1,683K,7000000.0,477220580.0,266567580.0,7061513.0,['Steven Spielberg'],"['Peter Benchley', 'Carl Gottlieb']","['Roy Scheider', 'Robert Shaw', 'Richard Dreyf...","['Monster Horror', 'Sea Adventure', 'Survival'...",['United States'],"[""Water Street, Edgartown, Martha's Vineyard, ...","['Zanuck/Brown Productions', 'Universal Pictur...",['English'],0,20,0
1,tt0073629,The Rocky Horror Picture Show,https://www.imdb.com/title/tt0073629,1975,1h 40m,R,7.4,173K,1200000.0,115798478.0,112892319.0,,['Jim Sharman'],"[""Richard O'Brien"", 'Jim Sharman']","['Tim Curry', 'Susan Sarandon', 'Barry Bostwick']","['Dark Comedy', 'Raunchy Comedy', 'Rock Musica...","['United Kingdom', 'United States']","['Oakley Court, Windsor Road, Oakley Green, Wi...","['Twentieth Century Fox', 'Michael White Produ...",['English'],0,4,0
2,tt0073486,One Flew Over the Cuckoo's Nest,https://www.imdb.com/title/tt0073486,1975,2h 13m,R,8.7,1.1M,3000000.0,109115366.0,108981275.0,,['Milos Forman'],"['Lawrence Hauben', 'Bo Goldman', 'Ken Kesey']","['Jack Nicholson', 'Louise Fletcher', 'Michael...","['Medical Drama', 'Psychological Drama', 'Drama']",['United States'],['Oregon State Mental Hospital - 2600 Center S...,"['Fantasy Films', 'N.V. Zvaluw']",['English'],0,15,0
3,tt0072890,Dog Day Afternoon,https://www.imdb.com/title/tt0072890,1975,2h 5m,R,8.0,279K,1800000.0,50002721.0,50000000.0,,['Sidney Lumet'],"['Frank Pierson', 'P.F. Kluge', 'Thomas Moore']","['Al Pacino', 'John Cazale', 'Penelope Allen']","['Heist', 'True Crime', 'Biography', 'Crime', ...",['United States'],"['285 Prospect Park West, Brooklyn, New York C...","['Warner Bros.', 'Artists Entertainment Complex']",['English'],0,20,0
4,tt0073692,Shampoo,https://www.imdb.com/title/tt0073692,1975,1h 50m,R,6.4,15K,4000000.0,49407734.0,49407734.0,,['Hal Ashby'],"['Robert Towne', 'Warren Beatty']","['Warren Beatty', 'Julie Christie', 'Goldie Ha...","['Satire', 'Comedy', 'Drama']",['United States'],"['2270 Bowmont Drive, Beverly Hills, Californi...","['Persky-Bright / Vista', 'Columbia Pictures',...",['English'],0,11,0


In [11]:
df_movies.dtypes

id                        object
Title                     object
Movie Link                object
Year                       int64
Duration                  object
MPA                       object
Rating                   float64
Votes                     object
budget                   float64
grossWorldWide           float64
gross_US_Canada          float64
opening_weekend_Gross    float64
directors                 object
writers                   object
stars                     object
genres                    object
countries_origin          object
filming_locations         object
production_companies      object
Languages                 object
wins                       int64
nominations                int64
oscars                     int64
dtype: object

### Anpassungen der Datentypen

In [12]:
# DataFrame anpassen (angenommen df ist dein DataFrame)
def clean_dataframe(df):
    # 1. Duration in Minuten umrechnen
    def duration_to_minutes(duration):
        if isinstance(duration, str):
            hours = int(duration.split('h')[0]) if 'h' in duration else 0
            minutes = int(duration.split('m')[0].split()[-1]) if 'm' in duration else 0
            return hours * 60 + minutes
        return None

    df['Duration'] = df['Duration'].apply(duration_to_minutes)

    # 2. Votes bereinigen (z. B. "683K" -> 683000)
    def votes_to_int(votes):
        if isinstance(votes, str):
            votes = votes.replace('K', '000').replace('M', '000000')
            return int(float(votes))
        return None

    df['Votes'] = df['Votes'].apply(votes_to_int)

    # 3. Listen-Spalten sauber formatieren
    list_columns = ['writers', 'stars', 'genres', 'countries_origin',
                    'filming_locations', 'production_companies']
    for col in list_columns:
        df[col] = df[col].apply(lambda x: eval(x) if isinstance(x, str) else x)

    # 4. Optional: Strings explizit als string speichern
    string_columns = ['id', 'Title', 'Movie Link', 'MPA', 'Languages']
    for col in string_columns:
        df[col] = df[col].astype('string')

    return df

# Bereinigten DataFrame speichern
df_movies_cleaned = clean_dataframe(df_movies)

# Datentypen prüfen
print(df_movies_cleaned.dtypes)

id                       string[python]
Title                    string[python]
Movie Link               string[python]
Year                              int64
Duration                        float64
MPA                      string[python]
Rating                          float64
Votes                           float64
budget                          float64
grossWorldWide                  float64
gross_US_Canada                 float64
opening_weekend_Gross           float64
directors                        object
writers                          object
stars                            object
genres                           object
countries_origin                 object
filming_locations                object
production_companies             object
Languages                string[python]
wins                              int64
nominations                       int64
oscars                            int64
dtype: object


In [13]:
df_movies_cleaned.head(5)

Unnamed: 0,id,Title,Movie Link,Year,Duration,MPA,Rating,Votes,budget,grossWorldWide,gross_US_Canada,opening_weekend_Gross,directors,writers,stars,genres,countries_origin,filming_locations,production_companies,Languages,wins,nominations,oscars
0,tt0073195,Jaws,https://www.imdb.com/title/tt0073195,1975,124.0,PG,8.1,683000.0,7000000.0,477220580.0,266567580.0,7061513.0,['Steven Spielberg'],"[Peter Benchley, Carl Gottlieb]","[Roy Scheider, Robert Shaw, Richard Dreyfuss]","[Monster Horror, Sea Adventure, Survival, Adve...",[United States],"[Water Street, Edgartown, Martha's Vineyard, M...","[Zanuck/Brown Productions, Universal Pictures]",['English'],0,20,0
1,tt0073629,The Rocky Horror Picture Show,https://www.imdb.com/title/tt0073629,1975,100.0,R,7.4,173000.0,1200000.0,115798478.0,112892319.0,,['Jim Sharman'],"[Richard O'Brien, Jim Sharman]","[Tim Curry, Susan Sarandon, Barry Bostwick]","[Dark Comedy, Raunchy Comedy, Rock Musical, Su...","[United Kingdom, United States]","[Oakley Court, Windsor Road, Oakley Green, Win...","[Twentieth Century Fox, Michael White Producti...",['English'],0,4,0
2,tt0073486,One Flew Over the Cuckoo's Nest,https://www.imdb.com/title/tt0073486,1975,133.0,R,8.7,1.0,3000000.0,109115366.0,108981275.0,,['Milos Forman'],"[Lawrence Hauben, Bo Goldman, Ken Kesey]","[Jack Nicholson, Louise Fletcher, Michael Berr...","[Medical Drama, Psychological Drama, Drama]",[United States],[Oregon State Mental Hospital - 2600 Center St...,"[Fantasy Films, N.V. Zvaluw]",['English'],0,15,0
3,tt0072890,Dog Day Afternoon,https://www.imdb.com/title/tt0072890,1975,125.0,R,8.0,279000.0,1800000.0,50002721.0,50000000.0,,['Sidney Lumet'],"[Frank Pierson, P.F. Kluge, Thomas Moore]","[Al Pacino, John Cazale, Penelope Allen]","[Heist, True Crime, Biography, Crime, Drama, T...",[United States],"[285 Prospect Park West, Brooklyn, New York Ci...","[Warner Bros., Artists Entertainment Complex]",['English'],0,20,0
4,tt0073692,Shampoo,https://www.imdb.com/title/tt0073692,1975,110.0,R,6.4,15000.0,4000000.0,49407734.0,49407734.0,,['Hal Ashby'],"[Robert Towne, Warren Beatty]","[Warren Beatty, Julie Christie, Goldie Hawn]","[Satire, Comedy, Drama]",[United States],"[2270 Bowmont Drive, Beverly Hills, California...","[Persky-Bright / Vista, Columbia Pictures, Rub...",['English'],0,11,0


In [14]:
# DataFrame speichern
df_movies_cleaned.to_csv('../Datasets/df_movie_for_streamlit.csv', index=False)


print("Das DataFrame wurde erfolgreich als 'df_movie_for_streamlit.csv' gespeichert.")


Das DataFrame wurde erfolgreich als 'df_movie_for_streamlit.csv' gespeichert.


# Wie geht es weiter?

Öffne das "data_prep_filter.ipynb" im Ordner "App_features" > "movie_filter"