In [1]:
import pyspark as ps
from pyspark.sql.types import *
from pyspark.sql import functions as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity ="all"

In [2]:
spark = (ps.sql.SparkSession.builder 
        .master("local[4]") 
        .appName("Capstone1") 
        .getOrCreate()
        )
sc = spark.sparkContext

In [3]:
schema = StructType([
    StructField("Year", IntegerType(), True),
    StructField("Time_Code", StringType(), True),
    StructField("Country", StringType(), True),
    StructField("Country_Code", StringType(), True),
    StructField("Males_15plus", FloatType(), True),
    StructField("poorest40_15plus", FloatType(), True),
    StructField("richest60_15plus", FloatType(), True),
    StructField("young_adult_15-24", FloatType(), True),
    StructField("Females_15plus", FloatType(), True),
    StructField("older_adults_25plus", FloatType(), True),
    StructField("primaryed_or_less_15plus", FloatType(), True),
    StructField("secondaryed_or_more_15plus", FloatType(), True),
    StructField("mobile_sub", FloatType(), True),
    StructField("mobile_sub_per100", FloatType(), True),
    StructField("ATM_per_100k", FloatType(), True) ])

In [4]:
mobile = spark.read.csv('data/World_Bank_mobile_data.csv',
                    header=True,       
                    quote='"',         
                    sep=",",          
                    schema=schema)  

In [5]:
mobile.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Time_Code: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Country_Code: string (nullable = true)
 |-- Males_15plus: float (nullable = true)
 |-- poorest40_15plus: float (nullable = true)
 |-- richest60_15plus: float (nullable = true)
 |-- young_adult_15-24: float (nullable = true)
 |-- Females_15plus: float (nullable = true)
 |-- older_adults_25plus: float (nullable = true)
 |-- primaryed_or_less_15plus: float (nullable = true)
 |-- secondaryed_or_more_15plus: float (nullable = true)
 |-- mobile_sub: float (nullable = true)
 |-- mobile_sub_per100: float (nullable = true)
 |-- ATM_per_100k: float (nullable = true)



In [6]:
mobile.createOrReplaceTempView("mobile")

In [7]:
data = pd.read_csv('data/World_Bank_mobile_data.csv')

In [8]:
data.rename(columns = {"Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+) [FX.OWN.TOTL.MA.ZS]": "Males_15plus", 
                       "Account ownership at a financial institution or with a mobile-money-service provider, poorest 40% (% of population ages 15+) [FX.OWN.TOTL.40.ZS]": "poorest40perc_15plus", 
                       "Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+) [FX.OWN.TOTL.ZS]": "pop_15plus",
                       "Account ownership at a financial institution or with a mobile-money-service provider, richest 60% (% of population ages 15+) [FX.OWN.TOTL.60.ZS]":"richest60perc_15plus",
                       "Automated teller machines (ATMs) (per 100,000 adults) [FB.ATM.TOTL.P5]":"ATM_per100K",  
                        "Account ownership at a financial institution or with a mobile-money-service provider, young adults (% of population ages 15-24) [FX.OWN.TOTL.YG.ZS]": "young_adults_15-24",
                        "Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+) [FX.OWN.TOTL.FE.ZS]": "Females_15plus",
                        "Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+) [FX.OWN.TOTL.OL.ZS]": "older_adults_25plus",
                        "Account ownership at a financial institution or with a mobile-money-service provider, primary education or less (% of population ages 15+) [FX.OWN.TOTL.PL.ZS]": "primaryed_or_less",
                        "Account ownership at a financial institution or with a mobile-money-service provider, secondary education or more (% of population ages 15+) [FX.OWN.TOTL.SO.ZS]": "secondaryed_or_more",
                        "Mobile cellular subscriptions [IT.CEL.SETS]": "mobile_subscr",
                        "Mobile cellular subscriptions (per 100 people) [IT.CEL.SETS.P2]": "mobile_subscr_per100",
                        "Country Name":"Country",
                        "Time": "Year"},
                        inplace=True)

In [9]:
data.head()

Unnamed: 0,Year,Time Code,Country,Country Code,Males_15plus,poorest40perc_15plus,pop_15plus,richest60perc_15plus,young_adults_15-24,Females_15plus,older_adults_25plus,primaryed_or_less,secondaryed_or_more,mobile_subscr,mobile_subscr_per100,ATM_per100K
0,2020,YR2020,Afghanistan,AFG,..,..,..,..,..,..,..,..,..,..,..,..
1,2020,YR2020,Albania,ALB,..,..,..,..,..,..,..,..,..,..,..,..
2,2020,YR2020,Algeria,DZA,..,..,..,..,..,..,..,..,..,..,..,..
3,2020,YR2020,American Samoa,ASM,..,..,..,..,..,..,..,..,..,..,..,..
4,2020,YR2020,Andorra,AND,..,..,..,..,..,..,..,..,..,..,..,..


In [10]:
data2 = data[['Country', 'ATM_per100K', 'Year']].copy()

In [11]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13205 entries, 0 to 13204
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Country      13200 non-null  object
 1   ATM_per100K  13200 non-null  object
 2   Year         13202 non-null  object
dtypes: object(3)
memory usage: 309.6+ KB


In [12]:
data2.agg({"ATM_per100K": lambda x: x.count()})

ATM_per100K    13200
dtype: int64

In [13]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13205 entries, 0 to 13204
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Country      13200 non-null  object
 1   ATM_per100K  13200 non-null  object
 2   Year         13202 non-null  object
dtypes: object(3)
memory usage: 309.6+ KB


In [14]:
data2 = data2[data2.ATM_per100K != '..']

In [15]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3355 entries, 265 to 13204
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Country      3350 non-null   object
 1   ATM_per100K  3350 non-null   object
 2   Year         3352 non-null   object
dtypes: object(3)
memory usage: 104.8+ KB


In [17]:
data2['ATM_per100K'] = data2['ATM_per100K'].astype(float)

In [22]:
grouped = data2.groupby('Country')
for group in grouped:
    print(group)

('Afghanistan',           Country  ATM_per100K  Year
528   Afghanistan     1.611932  2018
792   Afghanistan     1.213235  2017
1056  Afghanistan     1.059931  2016
1320  Afghanistan     0.917248  2015
1584  Afghanistan     0.749717  2014
1848  Afghanistan     0.705170  2013
2112  Afghanistan     0.636844  2012
2376  Afghanistan     0.615837  2011
2640  Afghanistan     0.528944  2010
2904  Afghanistan     0.449157  2009
3168  Afghanistan     0.299597  2008
3432  Afghanistan     0.206287  2007
3696  Afghanistan     0.116381  2006
3960  Afghanistan     0.059808  2005
4224  Afghanistan     0.015601  2004)
('Albania',       Country  ATM_per100K  Year
265   Albania    29.988454  2019
529   Albania    30.638125  2018
793   Albania    31.714076  2017
1057  Albania    34.080738  2016
1321  Albania    35.255272  2015
1585  Albania    34.750044  2014
1849  Albania    35.427446  2013
2113  Albania    35.737533  2012
2377  Albania    35.285166  2011
2641  Albania    34.131657  2010
2905  Albania   

In [93]:
data2['countrycount'] = data2.value_counts("Country")

In [51]:
data2.value_counts("Country")

Country
Latin America & the Caribbean (IDA & IBRD countries)    16
Mozambique                                              16
Maldives                                                16
Europe & Central Asia (IDA & IBRD countries)            16
Malta                                                   16
                                                        ..
United States                                            6
San Marino                                               6
Sierra Leone                                             5
Haiti                                                    5
Kiribati                                                 3
Length: 234, dtype: int64

In [77]:
country_counts = data2["Country"].value_counts()
countries = country_counts == 16 #get the boolean array for 16 entries
country_dictionary = dict(country_counts)

In [66]:
unique = data2['Country'].unique()
unique
len(unique)
len(unique[countries])

array(['Albania', 'American Samoa', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Austria', 'Azerbaijan', 'Bahamas, The',
       'Barbados', 'Belarus', 'Belize', 'Benin', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei Darussalam',
       'Bulgaria', 'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Canada',
       'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Egypt, Arab Rep.',
       'El Salvador', 'Estonia', 'Gambia, The', 'Georgia', 'Greece',
       'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana',
       'Honduras', 'Hong Kong SAR, China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Kazakhstan',
       'Kenya', 'Kuwait', 'Kyrgyz Republic', 'Latvia', 'Lebanon',
       'Lithuania', 'Luxembourg', 'Macao SAR, China', 'Malaysia',
       'Maldives', 'Mali', 'Malta', 'Mau

235

IndexError: boolean index did not match indexed array along dimension 0; dimension is 235 but corresponding boolean dimension is 234

In [91]:
unique_dictionary = dict()
for country in unique:
    unique_dictionary[country] = 1

unique_dictionary

{'Albania': 1,
 'American Samoa': 1,
 'Antigua and Barbuda': 1,
 'Argentina': 1,
 'Armenia': 1,
 'Aruba': 1,
 'Austria': 1,
 'Azerbaijan': 1,
 'Bahamas, The': 1,
 'Barbados': 1,
 'Belarus': 1,
 'Belize': 1,
 'Benin': 1,
 'Bolivia': 1,
 'Bosnia and Herzegovina': 1,
 'Brazil': 1,
 'Brunei Darussalam': 1,
 'Bulgaria': 1,
 'Burkina Faso': 1,
 'Cabo Verde': 1,
 'Cambodia': 1,
 'Canada': 1,
 'Chile': 1,
 'China': 1,
 'Colombia': 1,
 'Comoros': 1,
 'Costa Rica': 1,
 "Cote d'Ivoire": 1,
 'Croatia': 1,
 'Cyprus': 1,
 'Czech Republic': 1,
 'Denmark': 1,
 'Djibouti': 1,
 'Dominica': 1,
 'Dominican Republic': 1,
 'Egypt, Arab Rep.': 1,
 'El Salvador': 1,
 'Estonia': 1,
 'Gambia, The': 1,
 'Georgia': 1,
 'Greece': 1,
 'Grenada': 1,
 'Guatemala': 1,
 'Guinea': 1,
 'Guinea-Bissau': 1,
 'Guyana': 1,
 'Honduras': 1,
 'Hong Kong SAR, China': 1,
 'Hungary': 1,
 'Iceland': 1,
 'India': 1,
 'Indonesia': 1,
 'Israel': 1,
 'Italy': 1,
 'Jamaica': 1,
 'Japan': 1,
 'Kazakhstan': 1,
 'Kenya': 1,
 'Kuwait': 1,
 

In [92]:
key = []
for k in unique_dictionary.keys():
    if k not in country_dictionary.keys():
        key.append(k)

key

[nan]

In [None]:
data2.drop