In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


In [2]:
# Load the app listing from the CSV next to the notebook and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer="apps.csv")
df.head(n=5)


Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
0,Ak Parti Yardım Toplama,SOCIAL,,0,8.7,0,Paid,$13.99,Teen,Social,"July 28, 2017",4.1 and up
1,Ain Arabic Kids Alif Ba ta,FAMILY,,0,33.0,0,Paid,$2.99,Everyone,Education,"April 15, 2016",3.0 and up
2,Popsicle Launcher for Android P 9.0 launcher,PERSONALIZATION,,0,5.5,0,Paid,$1.49,Everyone,Personalization,"July 11, 2018",4.2 and up
3,Command & Conquer: Rivals,FAMILY,,0,19.0,0,,0,Everyone 10+,Strategy,"June 28, 2018",Varies with device
4,CX Network,BUSINESS,,0,10.0,0,Free,0,Everyone,Business,"August 6, 2018",4.1 and up


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape


(10841, 12)

In [5]:
# Print each column's non-null count and dtype, plus overall memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size_MBs        10841 non-null  float64
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content_Rating  10841 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last_Updated    10841 non-null  object 
 11  Android_Ver     10839 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 1016.5+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()


Unnamed: 0,Rating,Reviews,Size_MBs
count,9367.0,10841.0,10841.0
mean,4.191513,444111.9,19.774147
std,0.515735,2927629.0,21.404354
min,1.0,0.0,0.008301
25%,4.0,38.0,4.9
50%,4.3,2094.0,11.0
75%,4.5,54768.0,27.0
max,5.0,78158310.0,100.0


In [7]:
# Count the missing values in every column.
df.isna().sum()


App                  0
Category             0
Rating            1474
Reviews              0
Size_MBs             0
Installs             0
Type                 1
Price                0
Content_Rating       0
Genres               0
Last_Updated         0
Android_Ver          2
dtype: int64

In [8]:
# Discard every row that has a missing value in any column.
# NOTE(review): almost all affected rows are unrated apps (Rating has 1474
# nulls versus 1 for Type and 2 for Android_Ver), so the later per-category
# and per-rating totals exclude them — confirm that is intended.
df = df.dropna()


In [9]:
# Keep only rows whose rating is at most 5.
# .copy() makes the result an independent frame: later cells assign whole
# columns back into `df`, and doing that on a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
df = df.loc[df["Rating"] <= 5].copy()


In [10]:
# Turn the textual install counts into integers by stripping "," and "+".
# - astype(str) first, so the cell can be re-run after the column has already
#   been converted (the .str accessor fails on a numeric column).
# - regex=False makes both patterns literal on every pandas version; "+" on
#   its own is a regex quantifier, not a character.
# - "int64" is explicit so the dtype does not depend on the platform's
#   default integer width (the per-category sums run into the tens of billions).
df["Installs"] = (
    df["Installs"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace("+", "", regex=False)
    .astype("int64")
)


In [11]:
# Convert prices such as "$13.99" to floats by removing the dollar sign.
# - astype(str) first, so the cell can be re-run after the column is already
#   numeric (the .str accessor fails on a float column).
# - regex=False makes "$" a literal character; as a regex it is the
#   end-of-string anchor and would match nothing visible.
df["Price"] = (
    df["Price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .astype(float)
)


In [12]:
# Number of apps per category, largest category first.
category_count = df["Category"].value_counts(sort=True, ascending=False)
category_count


Category
FAMILY                 1747
GAME                   1097
TOOLS                   734
PRODUCTIVITY            351
MEDICAL                 350
COMMUNICATION           328
FINANCE                 323
SPORTS                  319
PHOTOGRAPHY             317
LIFESTYLE               315
PERSONALIZATION         312
BUSINESS                303
HEALTH_AND_FITNESS      297
SOCIAL                  259
SHOPPING                238
NEWS_AND_MAGAZINES      233
TRAVEL_AND_LOCAL        226
DATING                  195
BOOKS_AND_REFERENCE     178
VIDEO_PLAYERS           160
EDUCATION               155
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     124
FOOD_AND_DRINK          109
HOUSE_AND_HOME           76
WEATHER                  75
AUTO_AND_VEHICLES        73
LIBRARIES_AND_DEMO       65
ART_AND_DESIGN           62
COMICS                   58
PARENTING                50
EVENTS                   45
BEAUTY                   42
Name: count, dtype: int64

In [13]:
# Total installs accumulated by each category, ranked from most to least.
category_installs = (
    df.groupby(by="Category")["Installs"].agg("sum").sort_values(ascending=False)
)
category_installs


Category
GAME                   35085862717
COMMUNICATION          32647241530
PRODUCTIVITY           14176070180
SOCIAL                 14069841475
TOOLS                  11450724500
FAMILY                 10257701590
PHOTOGRAPHY            10088243130
NEWS_AND_MAGAZINES      7496210650
TRAVEL_AND_LOCAL        6868859300
VIDEO_PLAYERS           6221897200
SHOPPING                3247831540
ENTERTAINMENT           2869160000
PERSONALIZATION         2325341930
BOOKS_AND_REFERENCE     1921291655
SPORTS                  1751131465
HEALTH_AND_FITNESS      1583056220
BUSINESS                1001768120
FINANCE                  876612400
EDUCATION                871352000
MAPS_AND_NAVIGATION      724267560
LIFESTYLE                537562120
WEATHER                  426096500
FOOD_AND_DRINK           273777750
DATING                   264295110
HOUSE_AND_HOME           168582000
ART_AND_DESIGN           124233100
LIBRARIES_AND_DEMO        62083000
COMICS                    56036100
MEDICAL    

In [14]:
# Bar chart of the per-category install totals (already sorted descending).
fig = px.bar(
    title="Total Installs by Category",
    x=category_installs.index,
    y=category_installs.to_numpy(),
    labels=dict(x="Category", y="Total Installs"),
)
fig.show()


In [15]:
# Number of apps for each value of the Type column.
df["Type"].value_counts()


Type
Free    8719
Paid     646
Name: count, dtype: int64

In [16]:
# Box plot comparing the rating distribution across app types.
fig = px.box(
    data_frame=df,
    y="Rating",
    x="Type",
    title="App Ratings: Free vs Paid",
)
fig.show()


In [17]:
# Scatter of review count against install count, coloured by app type.
fig = px.scatter(
    data_frame=df,
    title="Reviews vs Installs",
    color="Type",
    y="Installs",
    x="Reviews",
)
fig.show()


In [18]:
# Restrict to paid apps, then plot price against rating.
paid_apps = df.loc[df["Type"].eq("Paid")]

fig = px.scatter(
    data_frame=paid_apps,
    title="Price vs Rating (Paid Apps)",
    y="Rating",
    x="Price",
)
fig.show()


In [23]:
# List the column labels of the frame.
df.columns


Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres', 'Last_Updated', 'Android_Ver'],
      dtype='object')

In [24]:
# Number of apps per content-rating label, most common first.
content_rating = df["Content_Rating"].value_counts(sort=True, ascending=False)
content_rating


Content_Rating
Everyone           7419
Teen               1084
Mature 17+          461
Everyone 10+        397
Adults only 18+       3
Unrated               1
Name: count, dtype: int64

In [25]:
# Pie chart showing each content-rating label's share of the apps.
fig = px.pie(
    title="Content Rating Distribution",
    names=content_rating.index,
    values=content_rating.to_numpy(),
)
fig.show()


In [29]:
# Parse the textual dates (e.g. "July 28, 2017") into datetimes. The explicit
# format avoids per-run format inference and makes any row that does not
# match fail loudly instead of being guessed at.
df["Last_Updated"] = pd.to_datetime(df["Last_Updated"], format="%B %d, %Y")

# Number of apps whose most recent update falls in each calendar year,
# ordered chronologically.
updates_per_year = df["Last_Updated"].dt.year.value_counts().sort_index()
updates_per_year


Last_Updated
2010       1
2011      15
2012      21
2013      94
2014     190
2015     388
2016     651
2017    1498
2018    6507
Name: count, dtype: int64

In [30]:
# Line chart of how many apps were last updated in each year.
# Axis labels are set explicitly (as in the installs bar chart); without them
# plotly falls back to the bare argument names "x" and "y". The labels also
# make clear that the counts are apps, since Last_Updated holds one date per app.
fig = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    title="Number of App Updates per Year",
    labels={"x": "Year of Last Update", "y": "Number of Apps"},
)
fig.show()
