# Find IT

## Import Libraries

In [17]:
import pandas as pd
import numpy as np

# Import other libraries if needed

## Import Dataset & Validation Set

In [18]:
# Load the training data from train.csv into `df`.
df = pd.read_csv('train.csv')
# NOTE(review): a notebook cell normally renders only its last expression,
# so this preview is presumably not displayed from here.
df.head()
# Keep `df` exactly as loaded; later cells modify this copy instead.
trainCopys = df.copy()

In [19]:
# Count the missing (NaN/None) entries in each column of the working copy.
# The frame created by the loading cell is named `trainCopys`; `trainCopy`
# is never defined in this notebook and raises NameError on a fresh run.
sumMissingValues = trainCopys.isnull().sum()
# Last expression of the cell: display the per-column counts.
sumMissingValues

developerCountry                   5107
countryCode                          64
userRatingCount                       0
primaryGenreName                      0
downloads                          2149
deviceType                            0
hasPrivacyLink                      750
hasTermsOfServiceLink              4635
hasTermsOfServiceLinkRating        4635
isCorporateEmailScore              1128
adSpent                            5679
appAge                               50
averageUserRating                  1232
appContentBrandSafetyRating        6162
appDescriptionBrandSafetyRating       0
mfaRating                             0
coppaRisk                             0
dtype: int64

In [20]:
# Load the target values from target.csv into `dg`.
dg = pd.read_csv('target.csv')
# NOTE(review): not the last expression of the cell, so this preview is
# presumably not displayed.
dg.head()
# Independent copy of the target frame; `dg` stays as loaded.
targetCopys = dg.copy()
# Last expression of the cell: display the copied target frame.
targetCopys

Unnamed: 0,coppaRisk
0,False
1,False
2,False
3,False
4,False
...,...
6995,False
6996,False
6997,False
6998,False


In [None]:
# Add the target as a `coppaRisk` column on the working feature frame.
# pandas aligns the assigned Series with `trainCopys` on the row index.
trainCopys["coppaRisk"] = targetCopys["coppaRisk"]

# Last expression of the cell: display the combined frame.
trainCopys

Unnamed: 0,developerCountry,countryCode,userRatingCount,primaryGenreName,downloads,deviceType,hasPrivacyLink,hasTermsOfServiceLink,hasTermsOfServiceLinkRating,isCorporateEmailScore,adSpent,appAge,averageUserRating,appContentBrandSafetyRating,appDescriptionBrandSafetyRating,mfaRating,coppaRisk
0,NORWAY,RO,127731,Sports,,smartphone,True,True,low,99.0,14.017220,160.400000,4.0,medium,low,low,False
1,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Medical,50 - 100,GLOBAL,True,,,99.0,,17.500000,0.0,,low,low,False
2,UNITED ARAB EMIRATES,CZ,51143,Games,50000000 - 100000000,GLOBAL,True,True,low,0.0,31.883163,30.766667,4.0,,low,low,False
3,GERMANY,GLOBAL,1074,Games,,GLOBAL,True,,,99.0,,71.533333,4.0,,low,low,False
4,CANNOT IDENTIFY COUNTRY,GLOBAL,17,Tools,1000 - 5000,GLOBAL,True,,,99.0,,52.400000,4.0,,low,low,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Utilities,,GLOBAL,True,,,99.0,,26.266667,0.0,,low,low,False
6996,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Business,,GLOBAL,True,,,,,23.800000,0.0,,low,low,False
6997,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Personalization,10 - 50,GLOBAL,True,,,0.0,,27.500000,,,medium,low,False
6998,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Business,10 - 50,GLOBAL,True,False,high,99.0,,124.033333,0.0,,low,low,False


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a validation set. The fixed random_state
# makes the shuffle, and therefore the split, reproducible between runs.
train_set, val_set = train_test_split(
    trainCopys,
    test_size=0.3,
    random_state=27,
)
train_set


from sklearn.base import BaseEstimator, TransformerMixin

class PlaceholderCleaner(BaseEstimator, TransformerMixin):
    """Replace placeholder strings with ``pd.NA`` so they count as missing.

    Parameters
    ----------
    placeholders : str or iterable of str, optional
        Values to treat as missing. ``None`` (the default) selects
        ``DEFAULT_PLACEHOLDERS``. An explicitly empty collection means
        "replace nothing".
    columns : str or iterable of column labels, optional
        Columns to clean. ``None`` or an empty collection cleans every
        column of the frame.
    """

    DEFAULT_PLACEHOLDERS = (
        "ADDRESS NOT LISTED IN PLAYSTORE",
        "CANNOT IDENTIFY COUNTRY",
    )

    def __init__(self, placeholders=None, columns=None):
        # Store the arguments untouched: scikit-learn's get_params /
        # set_params / clone require __init__ to keep parameters exactly as
        # passed. Defaults are resolved lazily in transform().
        self.placeholders = placeholders
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the placeholders replaced by ``pd.NA``.

        ``X`` itself is never modified.
        """
        cleaned = X.copy()

        placeholders = self._resolved_placeholders()
        if not placeholders:
            # Nothing to replace; still hand back a copy, as always.
            return cleaned

        columns = self._resolved_columns()
        if not columns:
            # No column restriction: clean the whole frame.
            return cleaned.replace(placeholders, pd.NA)

        for col in columns:
            cleaned[col] = cleaned[col].replace(placeholders, pd.NA)
        return cleaned

    def _resolved_placeholders(self):
        """Return the effective placeholders as a list."""
        placeholders = self.placeholders
        if placeholders is None:
            placeholders = self.DEFAULT_PLACEHOLDERS
        if isinstance(placeholders, str):
            # A lone string is one placeholder, not a sequence of characters.
            return [placeholders]
        return list(placeholders)

    def _resolved_columns(self):
        """Return the selected columns as a list (empty means all columns)."""
        columns = self.columns
        if columns is None:
            return []
        if isinstance(columns, str):
            # A lone string is one column label, not a sequence of characters.
            return [columns]
        # list() avoids the ambiguous truth value of an Index or ndarray.
        return list(columns)

Unnamed: 0,developerCountry,countryCode,userRatingCount,primaryGenreName,downloads,deviceType,hasPrivacyLink,hasTermsOfServiceLink,hasTermsOfServiceLinkRating,isCorporateEmailScore,adSpent,appAge,averageUserRating,appContentBrandSafetyRating,appDescriptionBrandSafetyRating,mfaRating,coppaRisk
2117,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Utilities,,GLOBAL,True,,,,,32.366667,0.0,,low,low,False
5951,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,64,Communication,10000 - 50000,GLOBAL,True,,,99.0,,32.833333,3.0,,low,low,False
4108,CANNOT IDENTIFY COUNTRY,GLOBAL,0,Travel & Local,100 - 500,GLOBAL,True,,,99.0,,36.366667,0.0,,low,low,False
1781,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,1,Education,,GLOBAL,,,,,,148.466667,1.0,,low,low,False
6489,CANNOT IDENTIFY COUNTRY,GLOBAL,0,Education,5 - 10,GLOBAL,True,,,99.0,,13.000000,0.0,,low,low,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4848,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,9,Shopping,100 - 500,GLOBAL,True,,,99.0,,54.433333,5.0,,low,low,False
6687,UNITED KINGDOM,RU,724,Games,,GLOBAL,True,True,low,99.0,10.116664,16.100000,4.0,,medium,low,False
3912,CANNOT IDENTIFY COUNTRY,GLOBAL,25,Shopping,1000 - 5000,GLOBAL,True,False,high,0.0,,71.833333,4.0,,low,low,False
3768,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Navigation,,GLOBAL,True,,,0.0,,16.033333,0.0,,high,low,False


## Handling Missing Data

Missing data can adversely affect the performance and accuracy of machine learning models. There are several strategies to handle missing data in machine learning:

1. **Data Imputation:**

    a. **Mean, Median, or Mode Imputation:** For numerical features, you can replace missing values with the mean or median of the non-missing values in the same feature; for categorical features, use the mode (the most frequent value). This method is simple and often effective when data is missing at random.

    b. **Constant Value Imputation:** You can replace missing values with a predefined constant value (e.g., 0) if it makes sense for your dataset and problem.

    c. **Imputation Using Predictive Models:** More advanced techniques involve using predictive models to estimate missing values. For example, you can train a regression model to predict missing numerical values or a classification model to predict missing categorical values.

2. **Deletion of Missing Data:**

    a. **Listwise Deletion:** In cases where the amount of missing data is relatively small, you can simply remove rows with missing values from your dataset. However, this approach can lead to a loss of valuable information.

    b. **Column (Feature) Deletion:** If a feature has a large number of missing values and is not critical for your analysis, you can consider removing that feature altogether.

3. **Domain-Specific Strategies:**

    a. **Domain Knowledge:** In some cases, domain knowledge can guide the imputation process. For example, if you know that missing values are related to a specific condition, you can impute them accordingly.

4. **Imputation Libraries:**

    a. **Scikit-Learn:** Scikit-Learn provides a `SimpleImputer` class that can handle basic imputation strategies like mean, median, and mode imputation.

    b. **Fancyimpute:** Fancyimpute is a Python library that offers more advanced imputation techniques, including matrix factorization, k-nearest neighbors, and deep learning-based methods.

The choice of imputation method should be guided by the nature of your data, the amount of missing data, the problem you are trying to solve, and the assumptions you are willing to make.

In [None]:
# placeholders = ["ADDRESS NOT LISTED IN PLAYSTORE", "CANNOT IDENTIFY COUNTRY", ""]
# trainCopy.replace(placeholders, pd.NA, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

# Split the frame that carries the target: `coppaRisk` is attached to
# `trainCopys`, while `df` is only the raw train.csv load.
Copy = trainCopys.copy()

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_set, val_set = train_test_split(Copy, test_size=0.3, random_state=27)

# The target of this dataset is `coppaRisk`; every other column is a feature.
# Dropping the target by name keeps the selection independent of column order.
x_train = train_set.drop(columns='coppaRisk')
y_train = train_set['coppaRisk']
x_val = val_set.drop(columns='coppaRisk')
y_val = val_set['coppaRisk']

# Last expression of the cell: display the training partition.
train_set