# Preparación de datasets

In [39]:
# importacion general de librerias
import pandas as pd

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [40]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [41]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [42]:
#https://drive.google.com/file/d/1y4_n4twjE4VSLSC-V7QsdPJ5JR9fiVgG/view?usp=sharing GooglePlayStore.csv

id='1y4_n4twjE4VSLSC-V7QsdPJ5JR9fiVgG'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('GooglePlayStore.csv')

Se obtuvieron únicamente las columnas necesarias para la resolución de los problemas en cuestión.

In [43]:
appDetails=pd.read_csv('GooglePlayStore.csv', usecols=['App','Size','Genres'])

In [44]:
appDetails

Unnamed: 0,App,Size,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design
1,Coloring book moana,14M,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design
3,Sketch - Draw & Paint,25M,Art & Design
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity
...,...,...,...
10836,Sya9a Maroc - FR,53M,Education
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education
10838,Parkinson Exercices FR,9.5M,Medical
10839,The SCP Foundation DB fr nn5n,Varies with device,Books & Reference


In [45]:
appDetails.duplicated().value_counts()

False    9680
True     1161
dtype: int64

Se droppearon las aplicaciones con el mismo valor en todas las columnas, dado que podrían existir aplicaciones con el mismo nombre.

In [46]:
validAppDetails = appDetails.drop_duplicates()
validAppDetails

Unnamed: 0,App,Size,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design
1,Coloring book moana,14M,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design
3,Sketch - Draw & Paint,25M,Art & Design
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity
...,...,...,...
10836,Sya9a Maroc - FR,53M,Education
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education
10838,Parkinson Exercices FR,9.5M,Medical
10839,The SCP Foundation DB fr nn5n,Varies with device,Books & Reference


Las aplicaciones sin nombre no son válidas:

In [47]:
validAppDetails = validAppDetails[appDetails['App'].notna()]
validAppDetails

Unnamed: 0,App,Size,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design
1,Coloring book moana,14M,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design
3,Sketch - Draw & Paint,25M,Art & Design
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity
...,...,...,...
10836,Sya9a Maroc - FR,53M,Education
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education
10838,Parkinson Exercices FR,9.5M,Medical
10839,The SCP Foundation DB fr nn5n,Varies with device,Books & Reference


In [48]:
validAppDetails.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9680 entries, 0 to 10840
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   App     9680 non-null   object
 1   Size    9680 non-null   object
 2   Genres  9680 non-null   object
dtypes: object(3)
memory usage: 302.5+ KB


In [49]:
validAppDetails[validAppDetails['Size']=='1,000+']

Unnamed: 0,App,Size,Genres
10472,Life Made WI-Fi Touchscreen Photo Frame,"1,000+","February 11, 2018"


In [50]:
validAppDetails = validAppDetails.drop(10472)
validAppDetails

Unnamed: 0,App,Size,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design
1,Coloring book moana,14M,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design
3,Sketch - Draw & Paint,25M,Art & Design
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity
...,...,...,...
10836,Sya9a Maroc - FR,53M,Education
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education
10838,Parkinson Exercices FR,9.5M,Medical
10839,The SCP Foundation DB fr nn5n,Varies with device,Books & Reference


Cambiamos el datatype de algunas columnas para mejorar el rendimiento.

In [51]:
validAppDetails['App']=validAppDetails['App'].astype('string')
validAppDetails.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9679 entries, 0 to 10840
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   App     9679 non-null   string
 1   Size    9679 non-null   object
 2   Genres  9679 non-null   object
dtypes: object(2), string(1)
memory usage: 302.5+ KB


# Ejercicio 40
Queremos saber cuánto pesaría si quisiéramos bajar todas las apps de un género, para todos los géneros. Para eso se pide: Calcular separado por géneros, cuanto pesarian todas las apps que tienen ese género (Tener en cuenta que si una app tiene acción y arte, su peso cuenta para ambos géneros) (⭐⭐)

In [52]:
appSizeAndGenres = validAppDetails.loc[:, ['App','Size','Genres']]
appSizeAndGenres

Unnamed: 0,App,Size,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design
1,Coloring book moana,14M,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design
3,Sketch - Draw & Paint,25M,Art & Design
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity
...,...,...,...
10836,Sya9a Maroc - FR,53M,Education
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education
10838,Parkinson Exercices FR,9.5M,Medical
10839,The SCP Foundation DB fr nn5n,Varies with device,Books & Reference


In [53]:
appPreciseSizeAndGenres = validAppDetails[validAppDetails['Size']!='Varies with device']
appPreciseSizeAndGenres

Unnamed: 0,App,Size,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design
1,Coloring book moana,14M,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design
3,Sketch - Draw & Paint,25M,Art & Design
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity
...,...,...,...
10835,FR Forms,9.6M,Business
10836,Sya9a Maroc - FR,53M,Education
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education
10838,Parkinson Exercices FR,9.5M,Medical


Vamos a truncar los valores menores a 1 KB, dado que están lejos de formar parte de cifras significativas

In [54]:
def convertToFloatInKB(size):
  if size[-1] == 'G':
    return int(float(size.strip('G'))*1048576)
  if size[-1] == 'M':
    return int(float(size.strip('M'))*1024)
  if size[-1] == 'k':
    return int(float(size.strip('k')))

In [55]:
appPreciseSizeAndGenres['Size in KB'] = appPreciseSizeAndGenres['Size'].apply(convertToFloatInKB)
appPreciseSizeAndGenres

Unnamed: 0,App,Size,Genres,Size in KB
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,Art & Design,19456
1,Coloring book moana,14M,Art & Design;Pretend Play,14336
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,Art & Design,8908
3,Sketch - Draw & Paint,25M,Art & Design,25600
4,Pixel Draw - Number Art Coloring Book,2.8M,Art & Design;Creativity,2867
...,...,...,...,...
10835,FR Forms,9.6M,Business,9830
10836,Sya9a Maroc - FR,53M,Education,54272
10837,Fr. Mike Schmitz Audio Teachings,3.6M,Education,3686
10838,Parkinson Exercices FR,9.5M,Medical,9728


In [56]:
appSizeInKBAndGenres = appPreciseSizeAndGenres.loc[:,['App','Genres','Size in KB']]
appSizeInKBAndGenres

Unnamed: 0,App,Genres,Size in KB
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19456
1,Coloring book moana,Art & Design;Pretend Play,14336
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8908
3,Sketch - Draw & Paint,Art & Design,25600
4,Pixel Draw - Number Art Coloring Book,Art & Design;Creativity,2867
...,...,...,...
10835,FR Forms,Business,9830
10836,Sya9a Maroc - FR,Education,54272
10837,Fr. Mike Schmitz Audio Teachings,Education,3686
10838,Parkinson Exercices FR,Medical,9728


In [57]:
appSizeInKBAndGenres.Genres.unique()

array(['Art & Design', 'Art & Design;Pretend Play',
       'Art & Design;Creativity', 'Art & Design;Action & Adventure',
       'Auto & Vehicles', 'Beauty', 'Books & Reference', 'Business',
       'Comics', 'Comics;Creativity', 'Communication', 'Dating',
       'Education', 'Education;Creativity', 'Education;Education',
       'Education;Action & Adventure', 'Education;Pretend Play',
       'Education;Brain Games', 'Entertainment',
       'Entertainment;Brain Games', 'Entertainment;Music & Video',
       'Events', 'Finance', 'Food & Drink', 'Health & Fitness',
       'House & Home', 'Libraries & Demo', 'Lifestyle',
       'Lifestyle;Pretend Play', 'Adventure;Action & Adventure', 'Arcade',
       'Casual', 'Card', 'Casual;Pretend Play', 'Strategy', 'Action',
       'Puzzle', 'Sports', 'Word', 'Racing', 'Casual;Creativity',
       'Simulation', 'Adventure', 'Board', 'Trivia', 'Role Playing',
       'Simulation;Education', 'Action;Action & Adventure',
       'Casual;Brain Games', 'Simulat

In [58]:
appSizeInKBAndGenres.Genres.map(lambda genres: [ genre for genre in genres.split(";")])

0                      [Art & Design]
1        [Art & Design, Pretend Play]
2                      [Art & Design]
3                      [Art & Design]
4          [Art & Design, Creativity]
                     ...             
10835                      [Business]
10836                     [Education]
10837                     [Education]
10838                       [Medical]
10840                     [Lifestyle]
Name: Genres, Length: 8450, dtype: object

In [59]:
appSizeInKBAndGenres['Genres List'] = appSizeInKBAndGenres.Genres.map(lambda genres: [ genre for genre in genres.split(";")])
appSizeInKBAndGenres

Unnamed: 0,App,Genres,Size in KB,Genres List
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19456,[Art & Design]
1,Coloring book moana,Art & Design;Pretend Play,14336,"[Art & Design, Pretend Play]"
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8908,[Art & Design]
3,Sketch - Draw & Paint,Art & Design,25600,[Art & Design]
4,Pixel Draw - Number Art Coloring Book,Art & Design;Creativity,2867,"[Art & Design, Creativity]"
...,...,...,...,...
10835,FR Forms,Business,9830,[Business]
10836,Sya9a Maroc - FR,Education,54272,[Education]
10837,Fr. Mike Schmitz Audio Teachings,Education,3686,[Education]
10838,Parkinson Exercices FR,Medical,9728,[Medical]


In [60]:
appWithDuplicatesForItsGenre = appSizeInKBAndGenres.explode('Genres List')
appWithDuplicatesForItsGenre.rename(columns = {'Genres List':'Genre'}, inplace = True)
appWithDuplicatesForItsGenre['Genre'] = appWithDuplicatesForItsGenre['Genre'].astype('category')
appWithDuplicatesForItsGenre = appWithDuplicatesForItsGenre.loc[:,['App','Genre','Size in KB']]
appWithDuplicatesForItsGenre

Unnamed: 0,App,Genre,Size in KB
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19456
1,Coloring book moana,Art & Design,14336
1,Coloring book moana,Pretend Play,14336
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8908
3,Sketch - Draw & Paint,Art & Design,25600
...,...,...,...
10835,FR Forms,Business,9830
10836,Sya9a Maroc - FR,Education,54272
10837,Fr. Mike Schmitz Audio Teachings,Education,3686
10838,Parkinson Exercices FR,Medical,9728


Cuánto pesarian en MB todas las apps que pertenecen a cada género:

In [61]:
appWithDuplicatesForItsGenre.groupby('Genre')['Size in KB'].sum()/1024

Genre
Action                               14,237.32
Action & Adventure                    3,804.10
Adventure                             2,698.39
Arcade                                6,961.79
Art & Design                            766.98
Auto & Vehicles                       1,502.75
Beauty                                  648.39
Board                                 1,363.77
Books & Reference                     2,665.96
Brain Games                           1,876.70
Business                              5,213.87
Card                                  1,348.51
Casino                                1,059.70
Casual                                6,532.99
Comics                                  675.92
Communication                         2,746.08
Creativity                            1,065.90
Dating                                2,490.09
Education                            12,408.52
Educational                           4,422.67
Entertainment                         8,310.78
Events 