# **Мини-проект по Pandas**

В этом мини-проекте мы анализируем данные о приложениях из Google Play Store.

## **Описание данных**

App — название приложения

Category — категория, к которой относится приложение

Rating — рейтинг пользователей

Reviews — количество отзывов пользователей о приложении

Size — размер приложения

Installs — количество загрузок/установок приложения пользователями

Type — платное или бесплатное приложение

Price — цена приложения

Content Rating — возрастная группа, на которую ориентировано приложение

Genres — принадлежность приложения к нескольким жанрам

Last Updated — дата последнего обновления приложения в Play Store

Current Ver — текущая версия приложения в Play Store

Android Ver — минимальная требуемая версия Android



In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Play Store dataset from the CSV file (path is relative to the working directory).
playstore = pd.read_csv(filepath_or_buffer='playstore.csv')

In [3]:
# Keep the first three rows of the dataset in data_head
# (the dataset itself is already loaded into `playstore`).

data_head = playstore.iloc[:3]
data_head

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [4]:
# Keep the last three rows of the dataset in data_tail.
data_tail = playstore.iloc[-3:]
data_tail

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10837,10837,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10838,10838,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device
10839,10839,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,4.5,398307,19M,"10,000,000+",Free,0,Everyone,Lifestyle,"July 25, 2018",Varies with device,Varies with device


In [5]:
# Number of columns in the dataset, stored in n_col.

n_col = playstore.shape[1]
n_col

14

In [6]:
# Number of rows in the dataset, stored in n_row.
n_row = playstore.shape[0]
n_row

10840

In [7]:
# Count the distinct application names (column App); missing values are not counted.

playstore['App'].nunique(dropna=True)

9659

In [8]:
# Count the missing values in the Rating column.
# isna() gives a boolean mask and sum() counts its True entries, so no
# filtered copy of the dataframe has to be built; int() keeps the result a
# plain Python integer, as len() returned before.

rating_missing = int(playstore['Rating'].isna().sum())
rating_missing

1474

In [9]:
# Assemble a dataframe from the first three rows, rows 6-8 and rows 16-19
# (counted from one, bounds inclusive), keeping the columns App, Size, Genres
# and Current Ver in that order, and save it as CSV.
# .loc slices are label-based and include both ends, so 5:7 selects labels 5, 6 and 7.

selected_columns = ['App', 'Size', 'Genres', 'Current Ver']
df1, df2, df3 = (
    playstore.loc[first:last, selected_columns]
    for first, last in ((0, 2), (5, 7), (15, 18))
)
pd.concat([df1, df2, df3]).to_csv('df.csv')

In [10]:
# Keep only the first occurrence of every application (column App).
# ignore_index=True already renumbers the result 0..n-1, so a separate
# reset_index call is not needed.

playstore = playstore.drop_duplicates(subset='App', keep='first', ignore_index=True)

In [11]:
# Normalise the column names: lower-case letters only, with spaces
# replaced by underscores.

playstore.columns = [name.lower().replace(' ', '_') for name in playstore.columns]
playstore.head()

Unnamed: 0,unnamed:_0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [12]:
# Share of each value of the type column (paid vs. free apps), rounded to two decimals.

(
    playstore['type']
    .value_counts(normalize=True)
    .round(2)
)

Free    0.92
Paid    0.08
Name: type, dtype: float64

In [13]:
# Select the applications in the EDUCATION category that have more than
# 1000 user reviews, and renumber the resulting rows from zero.

is_education = playstore['category'].eq('EDUCATION')
has_many_reviews = playstore['reviews'].gt(1000)
education_playstore = playstore.loc[is_education & has_many_reviews].reset_index(drop=True)
education_playstore

Unnamed: 0,unnamed:_0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,699,Duolingo: Learn Languages Free,EDUCATION,4.7,6289924,Varies with device,"100,000,000+",Free,0,Everyone,Education;Education,"August 1, 2018",Varies with device,Varies with device
1,700,TED,EDUCATION,4.6,181893,18M,"10,000,000+",Free,0,Everyone 10+,Education,"July 27, 2018",3.2.5,4.1 and up
2,701,English Communication - Learn English for Chin...,EDUCATION,4.7,2544,18M,"100,000+",Free,0,Everyone,Education,"December 29, 2017",3.1,4.0 and up
3,702,Khan Academy,EDUCATION,4.6,85375,21M,"5,000,000+",Free,0,Everyone,Education,"July 27, 2018",5.0.0,4.1 and up
4,703,Learn English with Wlingua,EDUCATION,4.7,314299,3.3M,"10,000,000+",Free,0,Everyone,Education,"May 2, 2018",1.94.9,4.0 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,848,SoloLearn: Learn to Code for Free,EDUCATION,4.8,256079,7.6M,"1,000,000+",Free,0,Teen,Education,"July 12, 2018",2.2.4,4.0.3 and up
94,849,Kids Learn Languages by Mondly,EDUCATION,4.4,2078,Varies with device,"100,000+",Free,0,Everyone,Education;Education,"December 24, 2017",1.0.2,4.1 and up
95,850,Blinkist - Nonfiction Books,EDUCATION,4.1,16103,13M,"1,000,000+",Free,0,Everyone,Education,"July 31, 2018",5.7.1,4.1 and up
96,853,Toca Life: City,EDUCATION,4.7,31085,24M,"500,000+",Paid,$3.99,Everyone,Education;Pretend Play,"July 6, 2018",1.5-play,4.4 and up


In [14]:
# Strip the currency sign from the price column and convert it to float.
# regex=False makes '$' a literal character; as a regular expression it is
# the end-of-string anchor, and relying on the default triggers a
# FutureWarning on pandas versions where regex defaults to True.

playstore['price'] = playstore['price'].str.replace('$', '', regex=False).astype(float)

  playstore['price'] = playstore['price'].str.replace('$','').astype(float)


In [15]:
# Pivot table: for every (category, type) pair, the mean price, mean rating
# and mean number of reviews, saved to pivot.csv.
# rename() returns a new frame, so `playstore` keeps its original column names.
# A single string aggfunc yields flat column names, so no column level has
# to be dropped afterwards.

pivot = (
    playstore
    .rename(columns={'price': 'mean_price', 'rating': 'mean_rating', 'reviews': 'mean_reviews'})
    .pivot_table(
        index=['category', 'type'],
        values=['mean_price', 'mean_rating', 'mean_reviews'],
        aggfunc='mean',
    )
    # Per-column precision: two decimals for price and reviews, one for rating.
    .round({'mean_price': 2, 'mean_rating': 1, 'mean_reviews': 2})
)
pivot.to_csv('pivot.csv', index=True)
pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_price,mean_rating,mean_reviews
category,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ART_AND_DESIGN,Free,0.00,4.3,23230.11
ART_AND_DESIGN,Paid,1.99,4.7,722.00
AUTO_AND_VEHICLES,Free,0.00,4.2,14140.28
AUTO_AND_VEHICLES,Paid,4.49,4.6,1387.67
BEAUTY,Free,0.00,4.3,7476.23
...,...,...,...,...
TRAVEL_AND_LOCAL,Paid,4.16,4.1,1506.08
VIDEO_PLAYERS,Free,0.00,4.0,424347.18
VIDEO_PLAYERS,Paid,2.62,4.1,3341.75
WEATHER,Free,0.00,4.2,171249.62
