## 3. Data Preparation

### Agenda

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling  
import missingno as msno

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Target directory holding the raw CSV files (adjust to your environment).
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative location so the notebook runs on other machines.
TARGET_DIR = r"/Users/danielmini/Projekte/data_analyics26/data_analytics_master/2_data_acquisition/raw_data"

# Position of the CSV to load within the alphabetically sorted file list.
CSV_INDEX = 1

# Find and load the CSV file.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# to make the positional selection below deterministic across machines/runs.
csv_files = sorted(f for f in os.listdir(TARGET_DIR) if f.endswith(".csv"))

# Fail with an explicit message instead of a bare IndexError when the
# directory does not contain enough CSV files.
if len(csv_files) <= CSV_INDEX:
    raise FileNotFoundError(
        f"Expected at least {CSV_INDEX + 1} CSV file(s) in {TARGET_DIR!r}, "
        f"found {len(csv_files)}: {csv_files}"
    )

df = pd.read_csv(os.path.join(TARGET_DIR, csv_files[CSV_INDEX]))

print("DataFrame geladen:", df.shape)

DataFrame geladen: (19158, 14)


In [4]:
# Build an automated profiling report for the loaded DataFrame.
profile = ydata_profiling.ProfileReport(
    df,
    title="Data Profile Report",
)

# Write the report to an HTML file in the current working directory.
profile.to_file(
    "data_profile_report.html"
)

100%|██████████| 14/14 [00:00<00:00, 2707.50it/s]0:00, 25.78it/s, Describe variable: target]        
Summarize dataset: 100%|██████████| 33/33 [00:00<00:00, 33.73it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  6.98it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 656.69it/s]


## Fahrplan v0.1 beta

1) Erster Überblick / Metadaten und Verständnis
- Größe des Datensatzes
- Anzahl Variablen
- Eigenschaften, Typen der Variablen
- Dimension
- Beispieldaten

2) Datenqualität: Missing Values, Duplikate, offensichtliche Probleme
- Missingness: Unterscheiden und analysieren, ob Missing at Random oder nicht
- Duplikate: echtes Duplikat (100 % identisch, keine Abhängigkeit von unbeobachteten Werten feststellbar) --> Lösung: löschen
  - kein echtes Duplikat: Es fehlt eine Variable, die den Konflikt der Daten auflösen kann und Klarheit über das Duplikat und dessen Ursache liefert
  - Analyse der Ursachen und Versuch der Lösung; letzte Möglichkeit: entweder ignorieren oder löschen, gerade wenn der %-Anteil klein ist (<1 %), dabei aber den Gesamtzusammenhang beachten
- offensichtliche Probleme: Ausreißer, fehlerhafte Daten, Messfehler

3) Datentypen bereinigen (sehr häufig nötig), bzw. Probleme der Daten lösen und anpassen ---> abhängig vom Ziel und der Problemstellung


## 1) Überblick – Ziel: Größe, Spalten, Datentypen, erste Werte

In [7]:
# Display the DataFrame dimensions as a (rows, columns) tuple.
df.shape

(19158, 14)

In [8]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [9]:
# Preview the last five rows (quick check that the end of the file loaded cleanly).
df.tail(5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0
19157,23834,city_67,0.855,,No relevent experience,no_enrollment,Primary School,,2,,,1,127,0.0


In [11]:
# Print a per-column summary: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

## 2) Datenqualität: Missingness, Duplikate und weiteres

In [13]:
# Missing values per column: absolute counts and percentage of all rows,
# both sorted from most to least affected column.
na_mask = df.isna()  # compute the null mask once and reuse it for both summaries
missing_abs = na_mask.sum().sort_values(ascending=False)
missing_pct = (na_mask.mean() * 100).sort_values(ascending=False)

# Report only the columns that actually contain missing values.
# sep="\n" places the Series on its own line; embedding "\n" in the label and
# relying on the default separator inserts a stray space before the first
# index label, which misaligns the first row of the printed table.
print("Missing Values (Absolute):", missing_abs[missing_abs > 0], sep="\n")
print("\nMissing Values (Percentage):", missing_pct[missing_pct > 0], sep="\n")


Missing Values (Absolute):
 company_type           6140
company_size           5938
gender                 4508
major_discipline       2813
education_level         460
last_new_job            423
enrolled_university     386
experience               65
dtype: int64

Missing Values (Percentage):
 company_type           32.049274
company_size           30.994885
gender                 23.530640
major_discipline       14.683161
education_level         2.401086
last_new_job            2.207955
enrolled_university     2.014824
experience              0.339284
dtype: float64


In [14]:

# Duplicates: count rows that are identical to an earlier row across all columns.
df.duplicated().sum()

np.int64(0)