# Cleans `continents2.csv`

In [1]:
# imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Apply seaborn's default theme to matplotlib figures created in this notebook
sns.set()
%matplotlib inline

In [2]:
# Read the country / region lookup table from disk and preview its first rows
df = pd.read_csv(filepath_or_buffer='data/continents2.csv')
df.head(n=5)

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [4]:
# Count rows that exactly duplicate an earlier row, and report the total number of rows
df.duplicated().sum(), df.shape[0]

(0, 249)

In [6]:
# Inspect the available column labels
df.columns


Index(['name', 'alpha-2', 'alpha-3', 'country-code', 'iso_3166-2', 'region',
       'sub-region', 'intermediate-region', 'region-code', 'sub-region-code',
       'intermediate-region-code'],
      dtype='object')

In [8]:
# Keep only the columns we think will be useful for our model.
# .copy() makes the result an independent DataFrame instead of a selection
# from the original, so modifying it later does not trigger pandas'
# SettingWithCopyWarning.
columns = ['name', 'region', 'sub-region']
df = df[columns].copy()
df.head()

Unnamed: 0,name,region,sub-region
0,Afghanistan,Asia,Southern Asia
1,Åland Islands,Europe,Northern Europe
2,Albania,Europe,Southern Europe
3,Algeria,Africa,Northern Africa
4,American Samoa,Oceania,Polynesia


In [9]:
# Count the missing values in each remaining column
df.isna().sum()

name          0
region        1
sub-region    1
dtype: int64

In [11]:
# Show every row that has at least one missing value
has_missing = df.isna().any(axis='columns')
df1 = df.loc[has_missing]
df1

Unnamed: 0,name,region,sub-region
8,Antarctica,,


In [12]:
# Antarctica's population is so low that dropping it won't affect our model.
# Reassign the result instead of using inplace=True, so the drop never mutates
# a DataFrame that may be a selection from another one (avoids
# SettingWithCopyWarning) and the cell stays safe to re-run.
df = df.dropna()
df.isnull().sum()

name          0
region        0
sub-region    0
dtype: int64

In [14]:
# Save the cleaned table. A forward slash is used so the path matches the one
# used when loading and works on every OS: with a backslash, '\c' is an invalid
# string escape, and outside Windows the backslash becomes part of the file
# name instead of acting as a directory separator.
# NOTE: this overwrites the file that was read at the top of the notebook.
df.to_csv('data/continents2.csv', index=False)

### Summary:
We kept only the columns that we, as a group, thought would be useful for our model, and ensured that those columns did not have any missing data. We dropped the only row that had missing data because the population of that place is so low that we did not think our model would be affected by removing it.