# Preprocessed Data Cleaning

In [None]:
import pandas as pd

In [None]:
# Load the preprocessed CSV into a DataFrame.
df_preprocessed = pd.read_csv(
    filepath_or_buffer="../data_preprocess/pp_data.csv",
)

### Observation

In [None]:
# Preview the first rows of the loaded frame.
# Other quick checks used while exploring (disabled):
#   df_preprocessed.shape
#   df_preprocessed.dtypes
df_preprocessed.head(n=5)

In [None]:
def get_missing_data(df):
    """Return the percentage of missing values per column, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the percentage (0-100) of
        missing entries in that column, sorted in descending order.
    """
    # Work on the argument rather than the notebook-level frame, so the
    # function reports on whichever frame it is given.
    missing_pct = df.isna().mean() * 100
    return missing_pct.sort_values(ascending=False)

# Share of missing values per column, expressed in percent.
missing_values = get_missing_data(df=df_preprocessed)
print("Missing values in percentage (%)")
missing_values

Drop columns with more than 50% missing values given that they are not crucial for the analysis:

In [None]:
# Columns whose missing-value percentage is strictly above this cut-off are removed.
threshold = 50
columns_to_drop = missing_values.loc[missing_values.gt(threshold)].index
# NOTE: the drop is in place; re-running this cell on its own raises KeyError
# once the selected columns are gone (when any column was selected).
df_preprocessed.drop(columns=columns_to_drop, inplace=True)
df_preprocessed.columns

Rename / standardize column names

In [None]:
# Map the original mixed-case headers to lower snake_case names.
df_preprocessed.rename(
    columns={
        'Name': 'name',
        'Create_Time': 'created_at',
        'Downloads': 'downloads',
        'Library_Name': 'library_name',
        'Pipeline_tag': 'pipeline_tag',
    },
    inplace=True,
)
df_preprocessed.columns

Format column values: split `name` into `model_name` / `model_version` and convert `created_at` to a date

In [None]:
# Split "<prefix>/<rest>" names at the first slash.
# str.extract returns the first match found anywhere in the string, so the
# pattern is anchored at the start and the first group accepts any non-slash
# character. An unanchored "[a-zA-Z0-9]+" group keeps only the last
# alphanumeric run of a prefix containing '-', '_' or '.'
# (e.g. "my-org/model" would yield "org" instead of "my-org").
# Names without a slash still produce NaN in both columns.
extracted_name = df_preprocessed['name'].str.extract(r"^([^/]+)/(.*)")
extracted_name

In [None]:
# Assign the extracted groups unchanged: unmatched names stay NaN.
# Casting with astype(str) would turn those NaN into the literal string
# "nan", hiding them from isna()/dropna() (the check below would always be 0).
df_preprocessed['model_name'] = extracted_name[0]
df_preprocessed['model_version'] = extracted_name[1]

# print(df_preprocessed['model_name'].isna().sum(), ',', df_preprocessed['model_version'].isna().sum())
df_preprocessed.head(2)

In [None]:
# df_preprocessed.duplicated().sum()
# Parse the timestamps, then keep only the calendar date of each one.
df_preprocessed['created_at'] = pd.to_datetime(df_preprocessed['created_at']).dt.date
df_preprocessed['created_at']

In [None]:
# Exploratory checks kept for reference (all disabled).
# df_preprocessed['private'].unique()
# df_preprocessed['private'].value_counts()
# df_preprocessed.drop(columns='private', inplace=True)

# df_preprocessed['downloads']
# df_preprocessed['likes']
# df_preprocessed["library_name"]
# df_preprocessed[["name", "library_name"]]

# Show the columns currently present in the frame.
df_preprocessed.columns

### Clean up Tags, tags, pipeline_tag

Delete library_name, tags, pipeline_tag and newtags, as all of their information can be found in the Tags column

In [None]:
# Exploratory checks kept for reference (all disabled).
# df_preprocessed['tags'].isna().sum()
# df_preprocessed['tags'].iloc[0].isna().sum()
# df_preprocessed['tags'].iloc[1].isna().sum()

# df_preprocessed['Tags'].iloc[0]
# df_preprocessed.iloc[0]
# df_preprocessed['tags'].unique()

# df.shape
# df_preprocessed.dtypes.head(45)

# Remove the tag-related columns in place.
# NOTE(review): the markdown above also lists library_name, but it is not
# dropped here; confirm which of the two is intended.
df_preprocessed.drop(
    columns=[
        'tags',
        'pipeline_tag',
        'newtags',
    ],
    inplace=True,
)
df_preprocessed.columns

### Modifying how name is printed
- Delete df_preprocessed['name'] (given that it can be obtained by using model_name and model_version)
- Move model_name and model_version to the first and second columns

In [None]:
# Remove the original 'name' column; its parts live in model_name / model_version.
df_preprocessed.drop(columns=['name'], inplace=True)
def move_columns(df, column_to_move, desired_position):
    """Relocate one column of ``df`` to a given position, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated and nothing is returned.
    column_to_move : label
        Name of the column to relocate.
    desired_position : int
        Zero-based position the column should occupy afterwards.
    """
    # pop() removes the column first, so the position refers to the frame
    # without that column.
    df.insert(desired_position, column_to_move, df.pop(column_to_move))

# Put the two identifying columns at the front of the frame.
move_columns(df=df_preprocessed, column_to_move='model_name', desired_position=0)
move_columns(df=df_preprocessed, column_to_move='model_version', desired_position=1)

df_preprocessed.head(n=2)

In [None]:
# Frequency of each distinct value in the 'langs' column.
df_preprocessed['langs'].value_counts()

In [None]:
def isunknow(lang):
    """Return True when ``lang`` is exactly the string ``','``, else False."""
    return bool(lang == ',')
    
# Count how many rows have the bare ',' placeholder in 'langs'.
df_preprocessed['langs'].apply(isunknow).value_counts()

Create a list of each model's languages, instead of an all-in-one string separated by commas

In [None]:
# NOTE: nothing in this cell runs; the candidate expressions below are all
# commented out, so 'langs' is left as a comma-separated string.
# df_preprocessed['langs'].str.replace(',', ' ').str.split().apply(lambda x: [i.strip() for i in x]).value_counts().index
# df_preprocessed['langs'] =
# df_preprocessed['langs'].str.replace(',', ' ').str.split().apply(lambda x: [i.strip() for i in x]).value_counts()

Drop non-crucial columns that do not contain interesting information

In [None]:
# Rationale recorded for each dropped column (checks are disabled):
# df_preprocessed['private'].value_counts() # same value for all rows
# df_preprocessed['nb_license'].value_counts() # not useful 0, 1, 2 number of licenses
# df_preprocessed['nb_adapterhub'].value_counts() # not useful (apart from 7 models, all are 0)
# df_preprocessed['nb_template'].value_counts() # nearly 90% have 0 templates
# df_preprocessed['nb_inference'].value_counts() # nearly 99% have 0 inferences
# df_preprocessed['nb_region'].value_counts() # all have 1 region
# df_preprocessed['nb_arxiv'].value_counts() # TODO: no rationale recorded for this column
# df_preprocessed['nb_pipeline'].value_counts() # nearly 99% have 0 pipelines
# df_preprocessed['nb_diffusers'].value_counts() # nearly 99% have 0 diffusers
# df_preprocessed['nb_doi'].value_counts() # nearly 99% have 0 dois

# Each label is listed once ('nb_adapterhub' was previously repeated).
df_preprocessed.drop(
    columns=[
        'private',
        'nb_license',
        'nb_adapterhub',
        'nb_template',
        'nb_inference',
        'nb_region',
        'nb_arxiv',
        'nb_pipeline',
        'nb_diffusers',
        'nb_doi',
    ],
    inplace=True,
)
df_preprocessed

# Author's review notes (kept verbatim):
# OK for frameworks
# OK NB_
# OK ONEHOT_safetensors_

Thinking about how we can properly sanitize df_preprocessed['Tags'], since not all of its values are useful

In [None]:
# Inspect the raw 'Tags' value of the row labelled 0.
df_preprocessed.loc[0, 'Tags']
# df_preprocessed['Tags'].str.replace('[','').str.replace(']','').str.replace('\'','').str.split(',').apply(lambda x: [i.strip() for i in x])

In [None]:
# Export of the cleaned frame is currently disabled.
# df_preprocessed.to_csv("../data_cleanup/clean_data.csv", index=False)