# Summary of ORG

This notebook summarises the SQL clauses and Python (pandas) methods that were introduced in the ORG programming exercises.

### Import the libraries

In [None]:
import numpy as np
import pandas as pd
import sqlite3

## Step 1. Connect to database

In [None]:
# Open a connection to the SQLite file that the queries below read from.
# Note: sqlite3 creates an empty database if the file does not exist, so a wrong
# path only shows up later as a "no such table" error.
db_path = '../datasets/countries.db'
conn_countries = sqlite3.connect(db_path)

## Step 2. Operate on database

In [None]:
# Load every row and every column of the `countries` table into a DataFrame.
query_all = 'SELECT * FROM countries;'
pd.read_sql_query(sql=query_all, con=conn_countries)

### `WHERE` clause


In [None]:
# Name and capital of the countries whose area is below 1000 while their
# population exceeds 200000 (both conditions must hold).
query_area_population = '''
    SELECT name, capital
    FROM countries
    WHERE area < 1000 AND population > 200000;
'''
pd.read_sql_query(sql=query_area_population, con=conn_countries)

### `GROUP BY` clause


In [None]:
# One row per continent: number of countries, total population and the largest
# single-country population. The query gets its own descriptive name (instead of
# a shared `query` variable) so that running cells out of order cannot pick up
# a different query by accident.
query_by_continent = '''
    SELECT continent, COUNT(name), SUM(population), MAX(population)
    FROM countries
    GROUP BY continent
'''
pd.read_sql_query(query_by_continent, conn_countries)

### `ORDER BY` clause


In [None]:
# All countries sorted from most to least populous. The query gets its own
# descriptive name (instead of a shared `query` variable) so that running cells
# out of order cannot pick up a different query by accident.
query_by_population = '''
    SELECT name, population
    FROM countries
    ORDER BY population DESC;
'''
pd.read_sql_query(query_by_population, conn_countries)

### `LIMIT` clause


In [None]:
# The ten most populous countries: sort descending, then keep the first 10 rows.
# The query gets its own descriptive name (instead of a shared `query` variable)
# so that running cells out of order cannot pick up a different query by accident.
query_top10_population = '''
    SELECT name, population
    FROM countries
    ORDER BY population DESC
    LIMIT 10;
'''
pd.read_sql_query(query_top10_population, conn_countries)

### Joining tables


In [None]:
# Inspect the `continents` lookup table that is joined to `countries` below.
query_continents = 'SELECT * FROM continents;'
pd.read_sql_query(query_continents, conn_countries)

In [None]:
# Ten most populous countries, each paired with the full name of its continent.
# - The explicit INNER JOIN ... ON returns the same rows as listing both tables in
#   FROM and filtering with WHERE, but keeps the join condition next to the join.
# - The column alias is an identifier, so it is written in double quotes;
#   single quotes denote string literals in standard SQL.
# - The query has its own descriptive name rather than reusing a shared `query`
#   variable, so out-of-order cell execution cannot run the wrong query.
query_top10_with_continent = '''
    SELECT countries.name, countries.population, continents.name AS "continent name"
    FROM countries
    INNER JOIN continents ON countries.continent = continents.code
    ORDER BY countries.population DESC
    LIMIT 10;
'''
pd.read_sql_query(query_top10_with_continent, conn_countries)

### Subqueries, or nested queries: combining results with `EXCEPT`


In [None]:
# Continent codes of the countries whose name starts with 'E'.
# There is deliberately no trailing semicolon: this text is spliced into a
# compound EXCEPT query in a later cell.
query_E = '''
    SELECT continent
    FROM countries
    WHERE name LIKE 'E%'
'''
pd.read_sql_query(sql=query_E, con=conn_countries)

In [None]:
# Continent codes of the countries whose area exceeds 10000000.
# There is deliberately no trailing semicolon: this text is spliced into a
# compound EXCEPT query in a later cell.
query_large = '''
    SELECT continent
    FROM countries
    WHERE area > 10000000
'''
pd.read_sql_query(sql=query_large, con=conn_countries)

In [None]:
# Build "<query_E> EXCEPT <query_large>": continents that appear in the first
# result but not in the second. Print the assembled SQL before running it so
# the combined statement can be read.
except_clause = '''
    EXCEPT
'''
query_E_except_large = ''.join([query_E, except_clause, query_large])
print(query_E_except_large)
pd.read_sql_query(sql=query_E_except_large, con=conn_countries)

## Step 3. Close connection to database


In [None]:
# Release the database connection now that all queries above have been run.
conn_countries.close()

## Data cleaning

In [None]:
# Read the CSV treating ONLY empty fields as missing: pandas' default NA markers
# are switched off, so literal text such as 'NA' stays a string.
# NOTE(review): presumably needed because some code in the data equals 'NA' - confirm.
country_data = pd.read_csv(
    '../datasets/countries.csv',
    keep_default_na=False,
    na_values=[''],
)
country_data.head()

## `df.astype()`: convert to given type

In [None]:
# Show the dtype pandas assigned to each column when the CSV was read.
country_data.dtypes

In [None]:
# Cast every column of the frame to string; the original frame is left untouched.
country_data_as_str = country_data.astype(dtype=str)
country_data_as_str.dtypes

In [None]:
# Cast only the `population` column to string by passing a column -> dtype mapping;
# all other columns keep their current dtype.
population_as_str = country_data.astype(dtype={'population': str})
population_as_str.dtypes

## `pd.to_numeric()`: convert to numeric


In [None]:
# Parse the string-typed `population` column back into numbers and preview it.
population_numeric = pd.to_numeric(country_data_as_str['population'])
population_numeric.head()

In [None]:
# Parse the string-typed `area` column back into numbers and preview it.
area_numeric = pd.to_numeric(country_data_as_str['area'])
area_numeric.head()

## `ts.unique()`: find all unique values


In [None]:
# Distinct values found in the `capital` column.
country_data['capital'].unique()

## `df.duplicated()`: find duplicates


In [None]:
# Boolean mask marking rows whose `capital` already occurred in an earlier row
# (the first occurrence itself is not flagged), then select those rows.
bm_dup_capital = country_data.duplicated(subset=['capital'])
country_data.loc[bm_dup_capital]

In [None]:
# With keep=False every member of a duplicate group is flagged, including the
# first occurrence, so all rows sharing a `capital` value are shown together.
bm_dup_capital_all = country_data.duplicated(subset=['capital'], keep=False)
country_data.loc[bm_dup_capital_all]

## `df.drop_duplicates()`: remove duplicates


In [None]:
# Row count, column dtypes and non-null counts of the unmodified frame.
country_data.info()

In [None]:
# Keep only the first row for each distinct `capital` value and summarise the
# result; `country_data` itself is not modified.
country_data_unique_capital = country_data.drop_duplicates(subset=['capital'])
country_data_unique_capital.info()

## `df.dropna()`: remove rows with missing values


In [None]:
# Drop every row that has at least one missing value (these are the defaults,
# spelled out here for clarity) and summarise what is left.
country_data_dropna = country_data.dropna(axis=0, how='any')
country_data_dropna.info()

## `df.fillna()`: replace missing values


In [None]:
# Fill missing capitals in the `capital` Series with the placeholder 'Unknown',
# then list every entry whose value occurs more than once (keep=False flags all
# members of a duplicate group).
# The result is a Series, so it gets a Series-specific name rather than
# `country_data_filled`, which the notebook uses for a whole DataFrame.
capital_filled = country_data['capital'].fillna(value='Unknown')
capital_filled[capital_filled.duplicated(keep=False)]

In [None]:
# Frame-level fill: the mapping restricts the replacement to the `capital`
# column, so missing values in other columns are left as they are.
country_data_filled = country_data.fillna(value={'capital': 'Unknown'})
country_data_filled

## `ts.str.strip()`: strip characters from the beginning/end of strings


In [None]:
# Remove every leading and trailing '0' character from the string form of `area`;
# zeros in the middle of a value are not affected.
country_data_as_str['area'].str.strip(to_strip='0')