### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import os

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None, 'display.max_rows', 100)


import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('whitegrid')
sns.set_palette('Blues_r')

from sklearn.preprocessing import StandardScaler
from scipy import stats

import random
import time
from datetime import datetime

import warnings
# warnings.filterwarnings('ignore')

In [21]:
# Reading the CSV file.
data= pd.read_csv('weather.csv')


In [22]:
df = data.copy()
df.shape

(3572328, 59)

In [25]:
drop_col = []
for col in df.columns:
    if df[col].nunique()<2:
        drop_col.append(col)
print(drop_col)

['M_PACKET_FORMAT', 'M_GAME_MAJOR_VERSION', 'M_PACKET_VERSION', 'M_PACKET_ID', 'M_SECONDARY_PLAYER_CAR_INDEX', 'M_SLI_PRO_NATIVE_SUPPORT', 'M_SAFETY_CAR_STATUS', 'Unnamed: 58']


In [26]:
for col in df.columns:
    if df[col].isnull().sum() > 0.3*df.shape[0]:
        drop_col.append(col)
print(drop_col)

['M_PACKET_FORMAT', 'M_GAME_MAJOR_VERSION', 'M_PACKET_VERSION', 'M_PACKET_ID', 'M_SECONDARY_PLAYER_CAR_INDEX', 'M_SLI_PRO_NATIVE_SUPPORT', 'M_SAFETY_CAR_STATUS', 'Unnamed: 58', 'M_ZONE_START', 'M_ZONE_FLAG', 'Unnamed: 58']


In [27]:
drop_row =[]
for col in df.columns:
    if df[col].isnull().sum() == 974274:
        drop_row.append(col)
print(drop_row)

['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE', 'M_TIME_OFFSET', 'M_WEATHER_FORECAST_SAMPLES_M_WEATHER', 'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE', 'M_TRACK_TEMPERATURE_CHANGE', 'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE', 'M_AIR_TEMPERATURE_CHANGE', 'M_RAIN_PERCENTAGE']


In [28]:
fill_col = []
for col in df.columns:
    if df[col].isnull().sum() in range(1, int(0.01*df.shape[0])):
        fill_col.append(col)
print(fill_col)

['M_NETWORK_GAME', 'M_TOTAL_LAPS', 'M_STEERING_ASSIST', 'M_IS_SPECTATING', 'M_DYNAMIC_RACING_LINE', 'M_DRSASSIST', 'M_NUM_MARSHAL_ZONES']


In [35]:
df.drop(drop_col, axis=1, inplace=True)
df['GAMEHOST'].fillna('Unknown', inplace=True)
for col in fill_col:
    df[col].fillna(df[col].mode(), inplace=True)
df.dropna(inplace=True)
df.isnull().sum()

KeyError: "['M_PACKET_FORMAT' 'M_GAME_MAJOR_VERSION' 'M_PACKET_VERSION' 'M_PACKET_ID'\n 'M_SECONDARY_PLAYER_CAR_INDEX' 'M_SLI_PRO_NATIVE_SUPPORT'\n 'M_SAFETY_CAR_STATUS' 'Unnamed: 58' 'M_ZONE_START' 'M_ZONE_FLAG'\n 'Unnamed: 58'] not found in axis"

In [33]:
df.shape

(2598053, 49)

In [34]:
df.to_csv('clean_data.csv', encoding='utf-8', index=False)