# Cloud Cycle Python Assessment

In [2]:
#Show ALL outputs in cell, not only last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats

from typing import List, Set, Dict, Tuple, Optional

In [None]:
def clean_mean(sample, cutoff):
    return # the mean of the sample, sans outliers

In [None]:
def test_example():
    " Example test case "
    sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
    cutoff = 3
    assertEqual(clean_mean(sample, cutoff), 5.5)

## Challenge 1:

### Original Solution

In [None]:
def clean_mean(
    sample: List[int],
    cutoff: int
) -> float:
    """
    Function to remove outliers from list and return new mean of cleaned sample
    """
    mean_sample = sum(sample) / len(sample)
    
    print("Initial sample mean:", mean_sample)

    q1, q3 = np.percentile(sample, [25, 75])

    iqr = q3 - q1

    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)

    sample_outlier_bool = (sample > upper_bound) | (sample < lower_bound)

    sample_outlier_indices = [i for i, x in enumerate(sample_outlier_bool) if x]

    cleaned_sample = [i for j, i in enumerate(sample) if j not in sample_outlier_indices]

    mean_cleaned_sample = round(sum(cleaned_sample) / len(cleaned_sample), 1)
    print("Cleaned sample mean:", mean_cleaned_sample)
    
    return mean_cleaned_sample # the mean of the sample, sans outliers

In [None]:
sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
cutoff = 3

In [None]:
mean_cleaned_sample = clean_mean(sample, cutoff)
type(mean_cleaned_sample)

In [None]:
assert clean_mean(sample, cutoff) == 5.5

### Re-worked Solution

In [None]:
def clean_mean(
    sample: List[int],
    cutoff: int
) -> float:
    """
    Function to remove outliers from list and return new mean of cleaned sample
    """
    
    sample_df = pd.DataFrame({
    'values': sample
    })
    
    mean_sample = sample_df['values'].mean()
    print("Initial sample mean:", mean_sample)
    
    x = sample_df['values']
    sample_df['values_normalised'] = (x-min(x))/(max(x)-min(x))
    
    sample_df['z_score'] = stats.zscore(sample_df['values'])
    
    outlier_mask = (sample_df['z_score'] > cutoff) | (sample_df['z_score'] < -cutoff)
    
    cleaned_sample_df = sample_df[~(outlier_mask)]
    
    mean_cleaned_sample = cleaned_sample_df['values'].mean()
    print("Cleaned sample mean:", mean_cleaned_sample)
    
    return mean_cleaned_sample # the mean of the sample, sans outliers

In [None]:
sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
sample_2 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999]
sample_3 = [-200, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]

cutoff = 2

In [None]:
clean_mean(sample, cutoff)

In [None]:
assert clean_mean(sample, cutoff) == 5.5
assert clean_mean(sample_2, cutoff) == 5.5
assert clean_mean(sample_3, cutoff) == 5.5

## Challenge 2:

In [5]:
data = 'data/pokemon.csv'
pokemon_df = pd.read_csv(data)

In [166]:
def get_legendary_pokemon(df: pd.DataFrame) -> pd.DataFrame:
    """
    (1)
    Select legendary Pokemon only, drop the 
    `Legendary` column and reset the index.
    """
    pass

In [168]:
"""
(1)
Select legendary Pokemon only,
drop the `Legendary` column 
and reset the index.
"""

legendary_mask = pokemon_df['Legendary'] == True

legendary_pokemon_df = \
pokemon_df.loc[(legendary_mask)] \
.drop('Legendary', axis=1) \
.reset_index()

legendary_pokemon_df

'\n(1)\nSelect legendary Pokemon only,\ndrop the `Legendary` column \nand reset the index.\n'

Unnamed: 0,index,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
0,156,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1
1,157,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1
2,158,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1
3,162,150,Mewtwo,Psychic,,680,106,110,90,154,90,130,1
4,163,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6
61,796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6
62,797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6
63,798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6


In [169]:
def name_starts_with(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """ 
    (2)
    Select all rows containing Pokemon names that start 
    with `prefix`. Only return `#`, `Name` and `Total` 
    columns and reset indices. 
    """
    pass

In [170]:
""" 
(2)
Select all rows containing Pokemon names that start with `prefix`. 

Only return `#`, `Name` and `Total` columns

and reset indices. 
"""

prefix = "Bulb"
starts_with_mask = pokemon_df['Name'].str.startswith(prefix)

name_starts_with_df = \
pokemon_df.loc[(starts_with_mask), ['#', 'Name', 'Total']]

name_starts_with_df

' \n(2)\nSelect all rows containing Pokemon names that start with `prefix`. \n\nOnly return `#`, `Name` and `Total` columns\n\nand reset indices. \n'

Unnamed: 0,#,Name,Total
0,1,Bulbasaur,318


In [171]:
def fix_camel_cased_names(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (3)
    Some of the `Name` column data in the dataframe 
    is badly formatted.

    Names in the form `LandorusIncarnate Forme` (that is, 
    with a word containing a lowercase letter immediately
    followed by an uppercase letter) should be fixed by 
    inserting a space between the letters as follows: 
    `Landorus Incarnate Forme`.

    An edge case is `Zygarde50% Forme` which should be 
    corrected to `Zygarde 50% Forme`. In other words, digits 
    immediately following a lowercase letter should be 
    considered as well as uppercase in splitting.

    The last edge case is the Pokemon named "Porygon2". 
    This is the correct name and should be maintained, so the 
    final requirement for inserting whitespace is that the rest 
    of the string must contain a non-digit character. Therefore,
    "Porygon12345" would be ignored but "Porygon12345abc"
    would become "Porygon 12345abc" since it has a non-digit 
    character after the "n12345" substring.

    Return only Pokemon with fixed names using the 
    columns `#` and `Name`.

    Finally, reset the index before returning the df.

    ====================
    A few more examples:
    ====================
    "AA"  => "AA"   (do nothing, this is normal)
    "aB"  => "a B"
    "a2"  => "a2"   (no further characters after the "2")
    "a23" => "a23"  (no further characters after the "23")
    "a2b" => "a 2b" (additional character "b" following "a2")
    """
    pass

In [92]:
# pokemon_df.head()

fix_camel_cased_names_df = pokemon_df.copy()

fix_camel_cased_names_df['Name_new'] = fix_camel_cased_names_df['Name'].str.replace(r"\B([A-Z])", r" \1")
# fix_camel_cased_names_df['Name_new'] = fix_camel_cased_names_df['Name'].str.replace(r"(\B[0-9])", r" \1")

name_diff_mask = fix_camel_cased_names_df['Name_new'] != fix_camel_cased_names_df['Name']
fix_camel_cased_names_df.loc[(name_diff_mask)]

fix_camel_cased_names_df['Name'] = fix_camel_cased_names_df['Name_new']
fix_camel_cased_names_df = fix_camel_cased_names_df[['#', 'Name']].reset_index()
# fix_camel_cased_names_df

prefix = "Z"
starts_with_mask = fix_camel_cased_names_df['Name'].str.startswith(prefix)

fix_camel_cased_names_df = \
fix_camel_cased_names_df.loc[(starts_with_mask)]

fix_camel_cased_names_df

  """


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Name_new
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,Venusaur Mega Venusaur
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False,Charizard Mega Charizard X
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False,Charizard Mega Charizard Y
12,9,BlastoiseMega Blastoise,Water,,630,79,103,120,135,115,78,1,False,Blastoise Mega Blastoise
19,15,BeedrillMega Beedrill,Bug,Poison,495,65,150,40,15,80,145,1,False,Beedrill Mega Beedrill
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,711,GourgeistLarge Size,Ghost,Grass,494,75,95,122,58,75,69,6,False,Gourgeist Large Size
787,711,GourgeistSuper Size,Ghost,Grass,494,85,100,122,58,75,54,6,False,Gourgeist Super Size
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True,Diancie Mega Diancie
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True,Hoopa Hoopa Confined


Unnamed: 0,index,#,Name
0,0,1,Bulbasaur
1,1,2,Ivysaur
2,2,3,Venusaur
3,3,3,Venusaur Mega Venusaur
4,4,4,Charmander
...,...,...,...
795,795,719,Diancie
796,796,719,Diancie Mega Diancie
797,797,720,Hoopa Hoopa Confined
798,798,720,Hoopa Hoopa Unbound


Unnamed: 0,index,#,Name
46,46,41,Zubat
157,157,145,Zapdos
286,286,263,Zigzagoon
367,367,335,Zangoose
582,582,523,Zebstrika
631,631,570,Zorua
632,632,571,Zoroark
695,695,634,Zweilous
707,707,644,Zekrom
794,794,718,Zygarde50% Forme


In [172]:
def get_most_common_type_combos(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (4)
    Find the Type 1 and Type 2 combo(s) that is/are
    most common for all Pokemon, ignoring nan. Order
    matters so `Psychic, Water` != `Water, Psychic`.
    
    Returned df should have columns `Type 1` and 
    `Type 2` and rows should be the most common. 
    """
    pass

In [173]:
""" 
(4)
Find the Type 1 and Type 2 combo(s) that is/are most common for all Pokemon, ignoring nan.

Order matters so `Psychic, Water` != `Water, Psychic`.

Returned df should have columns `Type 1` and `Type 2` and rows should be the most common. 
"""

most_common_type_combos_df = \
pokemon_df[['Type 1', 'Type 2']].dropna() \
.value_counts() \
.reset_index(name='Count')
# .iloc[[0]] \

most_common_type_combos_df

' \n(4)\nFind the Type 1 and Type 2 combo(s) that is/are most common for all Pokemon, ignoring nan.\n\nOrder matters so `Psychic, Water` != `Water, Psychic`.\n\nReturned df should have columns `Type 1` and `Type 2` and rows should be the most common. \n'

Unnamed: 0,Type 1,Type 2,Count
0,Normal,Flying,24
1,Grass,Poison,15
2,Bug,Flying,14
3,Bug,Poison,12
4,Ghost,Grass,10
...,...,...,...
131,Fire,Rock,1
132,Ice,Ghost,1
133,Fire,Dragon,1
134,Fighting,Flying,1


In [174]:
def get_most_common_legendary_pokemon_types(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (5)
    Of legendary Pokemon, return a DataFrame of the counts 
    of each type that appears in either `Type 1` or `Type 2` 
    columns ordered descending on the `Count` column and 
    ascending on the `Type` columns.
    """
    pass

In [320]:
""" 
(5)
Of legendary Pokemon,

return a DataFrame of the counts of each type that appears in either `Type 1` or `Type 2` 

columns ordered descending on the `Count` column 

and ascending on the `Type` columns.
"""

legendary_mask = pokemon_df['Legendary'] == True

most_common_legendary_pokemon_types = \
pokemon_df.loc[(legendary_mask), ['Type 1', 'Type 2']] \
.melt(value_vars=['Type 1', 'Type 2']) \
.value_counts() \
.reset_index(name='Count') \
.sort_values(['Count', 'value'], 
             ascending = [False, True])

most_common_legendary_pokemon_types

' \n(5)\nOf legendary Pokemon,\n\nreturn a DataFrame of the counts of each type that appears in either `Type 1` or `Type 2` \n\ncolumns ordered descending on the `Count` column \n\nand ascending on the `Type` columns.\n'

Unnamed: 0,variable,value,Count
0,Type 1,Psychic,14
1,Type 2,Flying,13
2,Type 1,Dragon,12
4,Type 1,Fire,5
3,Type 2,Psychic,5
6,Type 2,Dragon,4
7,Type 1,Electric,4
9,Type 2,Fighting,4
8,Type 1,Ground,4
10,Type 1,Rock,4


In [176]:
def group_by_generation_avg_strength(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (6)
    Rank the `Generation`s by average strength using 
    the `Total` column and sort descending. Counts
    are raw row counts; ignore duplicate Pokemon ids.
    
    Return the df with columns `Generation`, 
    `Mean Total` and `Count`.
    
    The final row of the returned df should be a summary
    of all generations shown in the df. The format is
    `All` (string), mean of `Mean Total` and sum of `Count`.
    """
    pass

In [188]:
""" 
(6)
Rank the `Generation`s by average strength using  the `Total` column and sort descending. 
Counts are raw row counts; ignore duplicate Pokemon ids.

Return the df with columns `Generation`, `Mean Total` and `Count`.

The final row of the returned df should be a summary of all generations shown in the df. 

The format is `All` (string), mean of `Mean Total` and sum of `Count`.
"""

generation_avg_strength_df = \
pokemon_df.drop_duplicates(subset=['#']) \
.groupby('Generation').agg({'Total': ['mean', 'count']}) \
.droplevel(level=0, axis=1) \
.sort_values('mean', ascending=False) \
.reset_index() \
.rename(columns={'mean':'Mean Total',
                 'count':'Count'})

generation_avg_strength_df.loc['All', 'Generation'] = 'All'
generation_avg_strength_df.loc['All', 'Mean Total'] = generation_avg_strength_df['Mean Total'].mean(axis=0)
generation_avg_strength_df.loc['All', 'Count'] = generation_avg_strength_df['Count'].sum(axis=0)
generation_avg_strength_df

' \n(6)\nRank the `Generation`s by average strength using  the `Total` column and sort descending. \nCounts are raw row counts; ignore duplicate Pokemon ids.\n\nReturn the df with columns `Generation`, `Mean Total` and `Count`.\n\nThe final row of the returned df should be a summary of all generations shown in the df. \n\nThe format is `All` (string), mean of `Mean Total` and sum of `Count`.\n'

Unnamed: 0,Generation,Mean Total,Count
0,4.0,445.757009,107.0
1,6.0,429.583333,72.0
2,5.0,425.307692,156.0
3,1.0,407.07947,151.0
4,2.0,406.18,100.0
5,3.0,402.059259,135.0
All,All,419.327794,721.0


In [None]:
def get_pokemon_with_unique_type1_type2_combos(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (7)
    Get combinations of `Type 1` and `Type 2` columns 
    which are unique in the entire df, respecting order 
    between the pair (in other words, a, b != b, a) and 
    ignoring all nan rows. Sort by `Name`.
    """
    pass

In [248]:
""" 
(7)
Get combinations of `Type 1` and `Type 2` columns which are unique in the entire df,

respecting order between the pair (in other words, a, b != b, a)

and ignoring all nan rows. Sort by `Name`.
"""

pokemon_with_unique_type1_type2_combos_df = pokemon_df.copy()

pokemon_with_unique_type1_type2_combos_df['unique_type_counts'] = \
pokemon_with_unique_type1_type2_combos_df \
.groupby(['Type 1', 'Type 2'])['Name'] \
.transform('count') \
.dropna() \

unique_types_mask = pokemon_with_unique_type1_type2_combos_df['unique_type_counts'] == 1

pokemon_with_unique_type1_type2_combos_df = \
pokemon_with_unique_type1_type2_combos_df.loc[(unique_types_mask)] \
.sort_values('Name')

pokemon_with_unique_type1_type2_combos_df

' \n(7)\nGet combinations of `Type 1` and `Type 2` columns which are unique in the entire df,\n\nrespecting order between the pair (in other words, a, b != b, a)\n\nand ignoring all nan rows. Sort by `Name`.\n'

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,unique_type_counts
366,334,AltariaMega Altaria,Dragon,Fairy,590,75,110,110,110,105,80,3,False,1.0
196,181,AmpharosMega Ampharos,Electric,Dragon,610,90,95,105,165,110,45,2,False,1.0
445,400,Bibarel,Normal,Water,410,79,85,60,55,60,71,4,False,1.0
271,251,Celebi,Psychic,Grass,600,100,100,100,100,100,100,2,False,1.0
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False,1.0
699,638,Cobalion,Steel,Fighting,580,91,90,129,90,72,108,5,True,1.0
772,702,Dedenne,Electric,Fairy,431,67,58,57,81,67,101,6,False,1.0
540,483,Dialga,Steel,Dragon,680,100,120,120,150,100,90,4,True,1.0
728,660,Diggersby,Normal,Ground,423,85,56,77,50,77,78,6,False,1.0
761,691,Dragalge,Poison,Dragon,494,65,75,90,97,123,44,6,False,1.0


In [None]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

def get_legendary_pokemon(df: pd.DataFrame) -> pd.DataFrame:
    """
    (1)
    Select legendary Pokemon only, drop the 
    `Legendary` column and reset the index.
    """
    pass

def name_starts_with(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """ 
    (2)
    Select all rows containing Pokemon names that start 
    with `prefix`. Only return `#`, `Name` and `Total` 
    columns and reset indices. 
    """
    pass
  
def fix_camel_cased_names(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (3)
    Some of the `Name` column data in the dataframe 
    is badly formatted.

    Names in the form `LandorusIncarnate Forme` (that is, 
    with a word containing a lowercase letter immediately
    followed by an uppercase letter) should be fixed by 
    inserting a space between the letters as follows: 
    `Landorus Incarnate Forme`.

    An edge case is `Zygarde50% Forme` which should be 
    corrected to `Zygarde 50% Forme`. In other words, digits 
    immediately following a lowercase letter should be 
    considered as well as uppercase in splitting.

    The last edge case is the Pokemon named "Porygon2". 
    This is the correct name and should be maintained, so the 
    final requirement for inserting whitespace is that the rest 
    of the string must contain a non-digit character. Therefore,
    "Porygon12345" would be ignored but "Porygon12345abc"
    would become "Porygon 12345abc" since it has a non-digit 
    character after the "n12345" substring.

    Return only Pokemon with fixed names using the 
    columns `#` and `Name`.

    Finally, reset the index before returning the df.

    ====================
    A few more examples:
    ====================
    "AA"  => "AA"   (do nothing, this is normal)
    "aB"  => "a B"
    "a2"  => "a2"   (no further characters after the "2")
    "a23" => "a23"  (no further characters after the "23")
    "a2b" => "a 2b" (additional character "b" following "a2")
    """
    pass

def get_most_common_type_combos(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (4)
    Find the Type 1 and Type 2 combo(s) that is/are
    most common for all Pokemon, ignoring nan. Order
    matters so `Psychic, Water` != `Water, Psychic`.
    
    Returned df should have columns `Type 1` and 
    `Type 2` and rows should be the most common. 
    """
    pass

def get_most_common_legendary_pokemon_types(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (5)
    Of legendary Pokemon, return a DataFrame of the counts 
    of each type that appears in either `Type 1` or `Type 2` 
    columns ordered descending on the `Count` column and 
    ascending on the `Type` columns.
    """
    pass

def group_by_generation_avg_strength(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (6)
    Rank the `Generation`s by average strength using 
    the `Total` column and sort descending. Counts
    are raw row counts; ignore duplicate Pokemon ids.
    
    Return the df with columns `Generation`, 
    `Mean Total` and `Count`.
    
    The final row of the returned df should be a summary
    of all generations shown in the df. The format is
    `All` (string), mean of `Mean Total` and sum of `Count`.
    """
    pass

def get_pokemon_with_unique_type1_type2_combos(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    (7)
    Get combinations of `Type 1` and `Type 2` columns 
    which are unique in the entire df, respecting order 
    between the pair (in other words, a, b != b, a) and 
    ignoring all nan rows. Sort by `Name`.
    """
    pass

In [None]:
#########
#########

In [None]:
sample_df = pd.DataFrame({
    'values': sample
})

sample_df.head(20)

In [None]:
x = sample_df['values']

sample_df['values_normalised'] = (x-min(x))/(max(x)-min(x))
sample_df.head(20)

In [None]:
sample_df['z_score'] = stats.zscore(sample_df['values'])

sample_df.head(20)

In [None]:
cutoff = 3

outlier_mask = (sample_df['z_score'] > cutoff) | (sample_df['z_score'] < -cutoff)
outlier_mask

In [None]:
cleaned_sample_df = sample_df[~(outlier_mask)]
cleaned_sample_df

In [None]:
cleaned_sample_df['values'].mean()