# Preprocessed Data Cleaning

In [1]:
import pandas as pd

In [2]:
# Load the preprocessed CSV into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df_preprocessed = pd.read_csv("../data_preprocess/pp_data.csv")

### Observation

In [3]:
# Quick look at the first rows of the loaded frame. Uncomment the lines
# below to also inspect its dimensions and per-column dtypes.
# df_preprocessed.shape
# df_preprocessed.dtypes
df_preprocessed.head()

Unnamed: 0,Name,Create_Time,Last_Modified,private,gated,Disabled,Downloads,likes,Library_Name,Tags,...,ONEHOT_autotrain_compatible,ONEHOT_safetensors,ONEHOT_tensorboard,ONEHOT_has_space,NB_en,NB_fr,NB_zh,NB_ar,NB_de,NB_es
0,albert/albert-base-v1,2022-03-02 23:29:04+00:00,,False,,,74335,4,transformers,"transformers, pytorch, tf, safetensors, albert...",...,1,1,0,0,1,0,0,0,0,0
1,albert/albert-base-v2,2022-03-02 23:29:04+00:00,,False,,,2855580,75,transformers,"transformers, pytorch, tf, jax, rust, safetens...",...,1,1,0,0,1,0,0,0,0,0
2,albert/albert-large-v1,2022-03-02 23:29:04+00:00,,False,,,1491,0,transformers,"transformers, pytorch, tf, albert, fill-mask, ...",...,1,0,0,0,1,0,0,0,0,0
3,albert/albert-large-v2,2022-03-02 23:29:04+00:00,,False,,,6875,12,transformers,"transformers, pytorch, tf, safetensors, albert...",...,1,1,0,0,1,0,0,0,0,0
4,albert/albert-xlarge-v1,2022-03-02 23:29:04+00:00,,False,,,1449,1,transformers,"transformers, pytorch, tf, safetensors, albert...",...,1,1,0,0,1,0,0,0,0,0


In [4]:
def get_missing_data(df):
    """Return the share of missing values per column, as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.Series
        One entry per column of `df` (indexed by column name) holding the
        percentage (0-100) of missing cells, sorted from the most to the
        least incomplete column.
    """
    # Use the argument rather than the notebook-level `df_preprocessed`, so
    # the helper reports on whichever frame it is given.
    # isna() yields a boolean frame; its column-wise mean is the missing ratio.
    missing_values = df.isna().mean() * 100
    return missing_values.sort_values(ascending=False)

# Compute the per-column missing percentages once; the resulting Series is
# reused further down to select the columns to drop.
missing_values = get_missing_data(df_preprocessed)
print("Missing values in percentage (%)")
missing_values

Missing values in percentage (%)


Spaces                         100.000000
Widget_data                    100.000000
Last_Modified                  100.000000
gated                          100.000000
Disabled                       100.000000
siblings                       100.000000
transformers_info              100.000000
config                         100.000000
model_index                    100.000000
mask_token                     100.000000
Card_data                      100.000000
size                            99.033795
techs                           91.192324
base_models                     44.300060
transformers                    36.941692
Pipeline_tag                     8.617566
tasks                            4.321084
newtags                          1.695332
tags                             1.695332
Library_Name                     0.201293
NB_techs                         0.000000
NB_Base_models                   0.000000
langs                            0.000000
NB_Lang                          0

Drop columns with more than 50% missing values given that they are not crucial for the analysis:

In [5]:
# Percentage of missing values above which a column is discarded.
threshold = 50
# Labels of the columns whose missing percentage is strictly above the threshold.
columns_to_drop = missing_values[missing_values > threshold].index
# NOTE: the drop happens in place, so re-running this cell on the already
# reduced frame raises a KeyError for the columns that are gone.
df_preprocessed.drop(columns=columns_to_drop, inplace=True)
df_preprocessed.columns

Index(['Name', 'Create_Time', 'private', 'Downloads', 'likes', 'Library_Name',
       'Tags', 'Pipeline_tag', 'nb_base_model', 'nb_license', 'nb_adapterhub',
       'nb_template', 'nb_inference', 'nb_region', 'nb_dataset', 'nb_arxiv',
       'nb_pipeline', 'nb_diffusers', 'nb_doi', 'tags', 'framework_torch',
       'framework_jax', 'framework_onnx', 'framework_tensorflow',
       'framework_keras', 'newtags', 'base_models', 'NB_Base_models', 'langs',
       'NB_Lang', 'tasks', 'NB_tasks', 'transformers', 'NB_techs',
       'ONEHOT_endpoints_compatible', 'ONEHOT_autotrain_compatible',
       'ONEHOT_safetensors', 'ONEHOT_tensorboard', 'ONEHOT_has_space', 'NB_en',
       'NB_fr', 'NB_zh', 'NB_ar', 'NB_de', 'NB_es'],
      dtype='object')

Rename columns to make their names more uniform

In [6]:
# Lower-case / snake_case the listed column labels, in place.
df_preprocessed.rename(
    columns={
        'Name': 'name',
        'Create_Time': 'created_at',
        'Downloads': 'downloads',
        'Library_Name': 'library_name',
        'Pipeline_tag': 'pipeline_tag',
    },
    inplace=True,
)
df_preprocessed.columns

Index(['name', 'created_at', 'private', 'downloads', 'likes', 'library_name',
       'Tags', 'pipeline_tag', 'nb_base_model', 'nb_license', 'nb_adapterhub',
       'nb_template', 'nb_inference', 'nb_region', 'nb_dataset', 'nb_arxiv',
       'nb_pipeline', 'nb_diffusers', 'nb_doi', 'tags', 'framework_torch',
       'framework_jax', 'framework_onnx', 'framework_tensorflow',
       'framework_keras', 'newtags', 'base_models', 'NB_Base_models', 'langs',
       'NB_Lang', 'tasks', 'NB_tasks', 'transformers', 'NB_techs',
       'ONEHOT_endpoints_compatible', 'ONEHOT_autotrain_compatible',
       'ONEHOT_safetensors', 'ONEHOT_tensorboard', 'ONEHOT_has_space', 'NB_en',
       'NB_fr', 'NB_zh', 'NB_ar', 'NB_de', 'NB_es'],
      dtype='object')

Split the `name` column into its two `/`-separated parts

In [7]:
# Split "owner/model" on the first slash: column 0 is the text before it,
# column 1 everything after it.
# The pattern is anchored at the start and accepts any non-slash character
# for the owner. str.extract keeps the first match found anywhere in the
# string, so an unanchored alphanumeric-only owner group would silently
# truncate owners containing '-', '_' or '.' (e.g. "my-org/model" -> "org").
# Names without a slash still yield NaN in both columns.
extracted_name = df_preprocessed['name'].str.extract(r"^([^/]+)/(.*)")
extracted_name

Unnamed: 0,0,1
0,albert,albert-base-v1
1,albert,albert-base-v2
2,albert,albert-large-v1
3,albert,albert-large-v2
4,albert,albert-xlarge-v1
...,...,...
44706,mmervecerit,vit-base-beans-tutorial
44707,jsfs11,WildMBXMarconi-SLERP-7B
44708,PetroGPT,WestSeverus-7B-DPO
44709,tinywell,ppo-Huggy


In [8]:
# Store the two extracted parts as new columns.
# No .astype(str) cast here: the extracted values are already strings, and
# the cast would turn rows where the pattern did not match (NaN) into the
# literal text 'nan', hiding them from the isna() check below.
df_preprocessed['model_name'] = extracted_name[0]
df_preprocessed['model_version'] = extracted_name[1]

# Uncomment to count the names that could not be split:
# print(df_preprocessed['model_name'].isna().sum(), ',', df_preprocessed['model_version'].isna().sum())
df_preprocessed.head(2)

Unnamed: 0,name,created_at,private,downloads,likes,library_name,Tags,pipeline_tag,nb_base_model,nb_license,...,ONEHOT_tensorboard,ONEHOT_has_space,NB_en,NB_fr,NB_zh,NB_ar,NB_de,NB_es,model_name,model_version
0,albert/albert-base-v1,2022-03-02 23:29:04+00:00,False,74335,4,transformers,"transformers, pytorch, tf, safetensors, albert...",fill-mask,0,1,...,0,0,1,0,0,0,0,0,albert,albert-base-v1
1,albert/albert-base-v2,2022-03-02 23:29:04+00:00,False,2855580,75,transformers,"transformers, pytorch, tf, jax, rust, safetens...",fill-mask,0,1,...,0,0,1,0,0,0,0,0,albert,albert-base-v2


In [9]:
# df_preprocessed.duplicated().sum()
df_preprocessed['created_at'] = pd.to_datetime(df_preprocessed['created_at'])
df_preprocessed['created_at']

0       2022-03-02 23:29:04+00:00
1       2022-03-02 23:29:04+00:00
2       2022-03-02 23:29:04+00:00
3       2022-03-02 23:29:04+00:00
4       2022-03-02 23:29:04+00:00
                   ...           
44706   2024-01-24 01:18:55+00:00
44707   2024-01-24 02:14:47+00:00
44708   2024-01-24 02:26:55+00:00
44709   2024-01-24 02:32:23+00:00
44710   2024-01-24 03:09:22+00:00
Name: created_at, Length: 44711, dtype: datetime64[ns, UTC]

In [10]:
# Exploratory checks kept for reference (all disabled); only the column
# listing at the bottom of the cell is executed.
# df_preprocessed['private'].unique()
# df_preprocessed['private'].value_counts()
# df_preprocessed.drop(columns='private', inplace=True)

# df_preprocessed['downloads']
# df_preprocessed['likes']
# df_preprocessed["library_name"]
# df_preprocessed[["name", "library_name"]]

df_preprocessed.columns

Index(['name', 'created_at', 'private', 'downloads', 'likes', 'library_name',
       'Tags', 'pipeline_tag', 'nb_base_model', 'nb_license', 'nb_adapterhub',
       'nb_template', 'nb_inference', 'nb_region', 'nb_dataset', 'nb_arxiv',
       'nb_pipeline', 'nb_diffusers', 'nb_doi', 'tags', 'framework_torch',
       'framework_jax', 'framework_onnx', 'framework_tensorflow',
       'framework_keras', 'newtags', 'base_models', 'NB_Base_models', 'langs',
       'NB_Lang', 'tasks', 'NB_tasks', 'transformers', 'NB_techs',
       'ONEHOT_endpoints_compatible', 'ONEHOT_autotrain_compatible',
       'ONEHOT_safetensors', 'ONEHOT_tensorboard', 'ONEHOT_has_space', 'NB_en',
       'NB_fr', 'NB_zh', 'NB_ar', 'NB_de', 'NB_es', 'model_name',
       'model_version'],
      dtype='object')

### Clean up Tags, tags, pipeline_tag

Delete tags, pipeline_tag and newtags, as all of their information can be found in the Tags column (library_name is kept for now)

In [11]:
# Exploratory checks kept for reference (all disabled).
# df_preprocessed['tags'].isna().sum()
# df_preprocessed['tags'].iloc[0].isna().sum()
# df_preprocessed['tags'].iloc[1].isna().sum()

# df_preprocessed['Tags'].iloc[0]
# df_preprocessed.iloc[0]
# df_preprocessed['tags'].unique()

# df.shape
# df_preprocessed.dtypes.head(45)
# Remove the three tag-derived columns in place; `Tags` itself is kept.
df_preprocessed.drop(columns=['tags','pipeline_tag','newtags'], inplace=True)
df_preprocessed.columns

Index(['name', 'created_at', 'private', 'downloads', 'likes', 'library_name',
       'Tags', 'nb_base_model', 'nb_license', 'nb_adapterhub', 'nb_template',
       'nb_inference', 'nb_region', 'nb_dataset', 'nb_arxiv', 'nb_pipeline',
       'nb_diffusers', 'nb_doi', 'framework_torch', 'framework_jax',
       'framework_onnx', 'framework_tensorflow', 'framework_keras',
       'base_models', 'NB_Base_models', 'langs', 'NB_Lang', 'tasks',
       'NB_tasks', 'transformers', 'NB_techs', 'ONEHOT_endpoints_compatible',
       'ONEHOT_autotrain_compatible', 'ONEHOT_safetensors',
       'ONEHOT_tensorboard', 'ONEHOT_has_space', 'NB_en', 'NB_fr', 'NB_zh',
       'NB_ar', 'NB_de', 'NB_es', 'model_name', 'model_version'],
      dtype='object')

### Modifying how the name is presented
- Delete df_preprocessed['name'] (given that it can be obtained by using model_name and model_version)
- Moving model_name and model_version as the first and second columns

In [12]:
# Drop the combined `name` column now that its two parts are stored in
# `model_name` and `model_version`.
# NOTE(review): this assumes the earlier split captured every name in full - verify.
df_preprocessed.drop(columns='name', inplace=True)
def move_columns(df, column_to_move, desired_position):
    """Relocate one column of `df` to a given position, modifying `df` in place.

    The column is popped out of the frame and inserted back at index
    `desired_position`. Nothing is returned.
    """
    df.insert(desired_position, column_to_move, df.pop(column_to_move))

# Bring the two name parts to the front of the frame (positions 0 and 1).
move_columns(df_preprocessed, 'model_name', 0)
move_columns(df_preprocessed, 'model_version', 1)

df_preprocessed.head(2)

Unnamed: 0,model_name,model_version,created_at,private,downloads,likes,library_name,Tags,nb_base_model,nb_license,...,ONEHOT_autotrain_compatible,ONEHOT_safetensors,ONEHOT_tensorboard,ONEHOT_has_space,NB_en,NB_fr,NB_zh,NB_ar,NB_de,NB_es
0,albert,albert-base-v1,2022-03-02 23:29:04+00:00,False,74335,4,transformers,"transformers, pytorch, tf, safetensors, albert...",0,1,...,1,1,0,0,1,0,0,0,0,0
1,albert,albert-base-v2,2022-03-02 23:29:04+00:00,False,2855580,75,transformers,"transformers, pytorch, tf, jax, rust, safetens...",0,1,...,1,1,0,0,1,0,0,0,0,0


In [43]:
# Frequency of each distinct raw `langs` string.
df_preprocessed['langs'].value_counts()

langs
,         31901
,en        6145
,zh         391
,ko         365
,ja         231
          ...  
,ko,fi        1
,ko,hu        1
,ko,ru        1
,lt,de        1
,en,te        1
Name: count, Length: 1093, dtype: int64

In [54]:
def isunknow(lang):
    """Return True when `lang` is exactly the string ',', False otherwise.

    NOTE(review): a bare ',' presumably marks a model with no language
    listed - confirm against the preprocessing step.
    """
    return bool(lang == ',')
    
# Count how many `langs` entries are the bare ',' value versus anything else.
df_preprocessed['langs'].apply(isunknow).value_counts()

langs
True     31901
False    12810
Name: count, dtype: int64

Create a list of each model's languages, instead of an all-in-one comma-separated string

In [72]:
# TODO: the conversion is still disabled - every line in this cell is
# commented out, so `langs` is left unchanged here.
# df_preprocessed['langs'].str.replace(',', ' ').str.split().apply(lambda x: [i.strip() for i in x]).value_counts().index
# df_preprocessed['langs'] =
# df_preprocessed['langs'].str.replace(',', ' ').str.split().apply(lambda x: [i.strip() for i in x]).value_counts()

Drop non-crucial columns that carry no interesting information

In [128]:
# Columns dropped below, with the reason noted while inspecting each one
# via `df_preprocessed[<column>].value_counts()`:
#   private       - same value for all rows
#   nb_license    - not useful (0, 1 or 2 licenses)
#   nb_adapterhub - not useful (apart from 7 models, all are 0)
#   nb_template   - nearly 90% have 0 templates
#   nb_inference  - nearly 99% have 0 inferences
#   nb_region     - all have 1 region
#   nb_arxiv      - no reason recorded (TODO: confirm it is safe to drop)
#   nb_pipeline   - nearly 99% have 0 pipelines
#   nb_diffusers  - nearly 99% have 0 diffusers
#   nb_doi        - nearly 99% have 0 dois

# Each label appears exactly once ('nb_adapterhub' used to be listed twice).
# NOTE: the drop happens in place, so re-running this cell raises a KeyError
# because the columns are already gone.
df_preprocessed.drop(
    columns=[
        'private', 'nb_license', 'nb_adapterhub', 'nb_template',
        'nb_inference', 'nb_region', 'nb_arxiv', 'nb_pipeline',
        'nb_diffusers', 'nb_doi',
    ],
    inplace=True,
)
df_preprocessed

# OK for frameworks
# OK NB_
# OK ONEHOT_safetensors_

Unnamed: 0,model_name,model_version,created_at,downloads,likes,library_name,Tags,nb_base_model,nb_dataset,framework_torch,...,ONEHOT_autotrain_compatible,ONEHOT_safetensors,ONEHOT_tensorboard,ONEHOT_has_space,NB_en,NB_fr,NB_zh,NB_ar,NB_de,NB_es
0,albert,albert-base-v1,2022-03-02 23:29:04+00:00,74335,4,transformers,"transformers, pytorch, tf, safetensors, albert...",0,2,1,...,1,1,0,0,1,0,0,0,0,0
1,albert,albert-base-v2,2022-03-02 23:29:04+00:00,2855580,75,transformers,"transformers, pytorch, tf, jax, rust, safetens...",0,2,1,...,1,1,0,0,1,0,0,0,0,0
2,albert,albert-large-v1,2022-03-02 23:29:04+00:00,1491,0,transformers,"transformers, pytorch, tf, albert, fill-mask, ...",0,2,1,...,1,0,0,0,1,0,0,0,0,0
3,albert,albert-large-v2,2022-03-02 23:29:04+00:00,6875,12,transformers,"transformers, pytorch, tf, safetensors, albert...",0,2,1,...,1,1,0,0,1,0,0,0,0,0
4,albert,albert-xlarge-v1,2022-03-02 23:29:04+00:00,1449,1,transformers,"transformers, pytorch, tf, safetensors, albert...",0,2,1,...,1,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44706,mmervecerit,vit-base-beans-tutorial,2024-01-24 01:18:55+00:00,11,0,transformers,"transformers, tensorboard, safetensors, vit, i...",1,1,0,...,1,1,1,0,0,0,0,0,0,0
44707,jsfs11,WildMBXMarconi-SLERP-7B,2024-01-24 02:14:47+00:00,18,0,transformers,"transformers, safetensors, mistral, text-gener...",2,0,0,...,1,1,0,0,0,0,0,0,0,0
44708,PetroGPT,WestSeverus-7B-DPO,2024-01-24 02:26:55+00:00,18,1,transformers,"transformers, safetensors, mistral, text-gener...",0,0,0,...,1,1,0,0,0,0,0,0,0,0
44709,tinywell,ppo-Huggy,2024-01-24 02:32:23+00:00,25,0,ml-agents,"ml-agents, tensorboard, onnx, Huggy, deep-rein...",0,0,0,...,0,0,1,0,0,0,0,0,0,0


Thinking about how we can properly sanitize df_preprocessed['Tags'], since not all of its values are useful

In [131]:
# Show one raw `Tags` value (row label 0) to see what would need sanitizing.
df_preprocessed['Tags'][0]
# Draft of a possible clean-up (disabled):
# df_preprocessed['Tags'].str.replace('[','').str.replace(']','').str.replace('\'','').str.split(',').apply(lambda x: [i.strip() for i in x])

'transformers, pytorch, tf, safetensors, albert, fill-mask, exbert, en, dataset:bookcorpus, dataset:wikipedia, arxiv:1909.11942, license:apache-2.0, autotrain_compatible, endpoints_compatible, region:us'

In [126]:
# Export of the cleaned frame is currently disabled; uncomment to write the CSV.
# df_preprocessed.to_csv("../data_cleanup/clean_data.csv", index=False)