### In this notebook I analyse the descriptions of the films and TV shows, first by rating and then by country.
#### I use some tools from the nltk package to clean up the descriptions, and plotly for the visualizations.

In [1]:
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from plotly.subplots import make_subplots

In [2]:
# Location of the Netflix titles CSV, relative to the notebook's working directory.
DATA_PATH = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [4]:
# Column dtypes and non-null counts, to see which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


## What are the most used words in the descriptions?

In [5]:
@lru_cache(maxsize=1)
def _english_stopwords():
    '''Returns the NLTK English stopwords as a frozenset; the corpus is read only on the first call.'''
    return frozenset(stopwords.words('english'))


def desc_list(description):
    '''Takes a string, removes punctuation and stopwords and returns a list of words.

    The text is lower-cased and split into runs of word characters, so punctuation
    is dropped. Tokens found in the NLTK English stopword list are then removed.
    Token order and duplicates are preserved.

    Parameters
    ----------
    description : str
        The text to clean. Any non-string value (for example NaN from a missing
        description) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The remaining lower-cased words, in their original order.
    '''
    if not isinstance(description, str):
        return []
    tokenizer = RegexpTokenizer(r'\w+')
    # Look the stopwords up in a cached set: calling stopwords.words() inside the
    # comprehension would re-read the corpus and scan a list for every single token.
    stop_words = _english_stopwords()
    return [word for word in tokenizer.tokenize(description.lower()) if word not in stop_words]

In [6]:
# Tokenise every description into its list of cleaned words.
df['desc_lists'] = df['description'].map(desc_list)

In [7]:
# Inspect the cleaned word lists produced by desc_list.
df['desc_lists']

0       [planning, awesome, wedding, grandfather, pola...
1       [jandino, asporaat, riffs, challenges, raising...
2       [help, three, human, allies, autobots, protect...
3       [prison, ship, crash, unleashes, hundreds, dec...
4       [nerdy, high, schooler, dani, finally, attract...
                              ...                        
6229    [parody, first, person, shooter, games, milita...
6230    [marc, maron, stars, marc, maron, interviews, ...
6231    [nursery, rhymes, original, music, children, a...
6232    [set, russian, revolution, comic, miniseries, ...
6233    [hit, sitcom, follows, merry, misadventures, s...
Name: desc_lists, Length: 6234, dtype: object

In [8]:
# Flatten the per-title word lists into one long list of words
all_words = [word for words in df.desc_lists for word in words]

In [9]:
# Prepare the data for visualization: one row per distinct word with its total
# number of occurrences, most frequent first.
# A Series built straight from the Counter avoids creating a one-row frame with a
# column per word and transposing it; the explicit dtype also keeps an empty
# Counter from producing an object column.
word_counts = (
    pd.Series(Counter(all_words), dtype='int64')
    .rename_axis('word')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [10]:
# Bar chart of the 30 most frequent words (word_counts is sorted most-frequent first).
top_words = word_counts.head(30)
fig = px.bar(top_words, x='word', y='count', text='count')

# Plotly's update_* methods return the figure, so the styling calls can be chained.
(fig.update_traces(textposition='outside')
    .update_layout(uniformtext_minsize=5)
    .update_xaxes(tickangle=45))

fig.show()

## What about most common words by audience suitability rating?

In [11]:
# Number of titles in each rating category (missing ratings are not counted).
df['rating'].value_counts()

TV-MA       2027
TV-14       1698
TV-PG        701
R            508
PG-13        286
NR           218
PG           184
TV-Y7        169
TV-G         149
TV-Y         143
TV-Y7-FV      95
G             37
UR             7
NC-17          2
Name: rating, dtype: int64

In [12]:
# Alphabetical list of the distinct rating labels, ignoring missing values.
sorted(df['rating'].dropna().unique())

['G',
 'NC-17',
 'NR',
 'PG',
 'PG-13',
 'R',
 'TV-14',
 'TV-G',
 'TV-MA',
 'TV-PG',
 'TV-Y',
 'TV-Y7',
 'TV-Y7-FV',
 'UR']

In [13]:
# Create a dictionary with the rating categories as keys and, as values, dataframes
# holding each word's number of occurrences per title for that rating.
rating_title_counts = df['rating'].value_counts()  # computed once; NaN ratings are excluded

d_ratings = {}
for rating in sorted(rating_title_counts.index):
    rating_words = Counter()
    for words in df.loc[df['rating'] == rating, 'desc_lists']:
        rating_words.update(words)
    # We divide the word count by the total number of titles in each rating so that
    # ratings of very different sizes can be compared.
    per_title = pd.Series(rating_words, dtype='float64') / rating_title_counts[rating]
    # Sorted ascending: the most frequent words are the LAST rows of each frame.
    d_ratings[rating] = (
        per_title.rename_axis('word')
        .reset_index(name='count')
        .sort_values(by='count', ascending=True)
    )

In [14]:
# Show the five most frequent words per rating; the frames are sorted ascending,
# so these are the last five rows.
for rating_name, freqs in d_ratings.items():
    print(rating_name, freqs.tail(5), '\n', sep='\n')

G
        word     count
35      save  0.108108
424     half  0.108108
18       get  0.108108
224  friends  0.108108
145    world  0.189189


NC-17
        word  count
2     friend    0.5
1       best    0.5
28    piques    0.5
29  interest    0.5
7    student    1.0


NR
       word     count
32      new  0.077982
0       man  0.077982
249  family  0.082569
143    life  0.082569
15    young  0.087156


PG
       word     count
158    help  0.076087
147     new  0.097826
29    young  0.097826
288  family  0.097826
58    world  0.114130


PG-13
      word     count
98    must  0.080420
146   life  0.083916
1    young  0.087413
52   world  0.087413
30     new  0.094406


R
      word     count
96     two  0.062992
102  woman  0.062992
159  young  0.078740
57     new  0.092520
508   life  0.102362


TV-14
       word     count
244  family  0.082450
163     man  0.090106
19     love  0.091873
173   young  0.098940
41     life  0.098940


TV-G
        word     count
267   school  0.067114
8

In [15]:
# Plotting the top 10 words for each rating, one horizontal bar chart per rating.
rating_sizes = df['rating'].value_counts()  # titles per rating, computed once for the subplot titles
n_cols = 4
# Ceiling division: enough rows for every rating (4 rows for the 14 ratings in this dataset),
# so the grid no longer has to be adjusted by hand if the number of ratings changes.
n_rows = max(1, -(-len(d_ratings) // n_cols))

fig = make_subplots(rows=n_rows,
                    cols=n_cols,
                    subplot_titles=[key + ' (' + str(rating_sizes[key]) + ' titles)' for key in d_ratings.keys()],
                    horizontal_spacing=0.08)

# Fill the grid row by row, in the same order as the subplot titles.
for position, freqs in enumerate(d_ratings.values()):
    top_words = freqs.iloc[-10:]  # frames are sorted ascending, so the tail holds the top words
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(go.Bar(y=top_words['word'],
                         x=top_words['count'],
                         text=[round(num, 2) for num in top_words['count']],
                         orientation='h'),
                  row=grid_row + 1,
                  col=grid_col + 1)

fig.update_layout(height=1500, width=1200, showlegend=False)
fig.update_traces(textposition='inside', textfont_size=20)
fig.show()

### A few takeaways:
1. For TV shows, the word 'love' is more common in the TV-PG and TV-14 ratings, while it's not even in the top 10 in the TV-MA (Mature Audience) category.
2. In TV-Y shows which are aimed at very young audiences, it's all about 'friends', 'fun' and 'adventures'.
3. In the TV-Y7 category, about 10% of the shows have the word 'evil' in their description.
4. For films, the R-rated category is the only one with the word 'woman' in its most common words.

## What about different countries?

In [16]:
# The ten most frequent values of the 'country' column.
df['country'].value_counts().head(10)

United States     2032
India              777
United Kingdom     348
Japan              176
Canada             141
South Korea        136
Spain              117
France              90
Mexico              83
Turkey              79
Name: country, dtype: int64

### We'll only use the top 6 countries.

In [17]:
# Names of the six most frequent values in the 'country' column.
countries = df['country'].value_counts().head(6).index.tolist()
countries

['United States', 'India', 'United Kingdom', 'Japan', 'Canada', 'South Korea']

In [18]:
# Create a dictionary with the selected countries as keys and, as values, dataframes
# holding each word's number of occurrences per title for that country.
# NOTE: rows are matched on the exact 'country' value, so titles that list several
# countries (e.g. "United States, India, South Korea, China") are not included.
country_title_counts = df['country'].value_counts()  # computed once instead of once per country

d_countries = {}
for country in countries:
    country_words = Counter()
    for words in df.loc[df['country'] == country, 'desc_lists']:
        country_words.update(words)
    # Divide by the number of titles so countries of different sizes can be compared.
    per_title = pd.Series(country_words, dtype='float64') / country_title_counts[country]
    # Sorted ascending: the most frequent words are the LAST rows of each frame.
    d_countries[country] = (
        per_title.rename_axis('word')
        .reset_index(name='count')
        .sort_values(by='count', ascending=True)
    )

In [19]:
# Show the five most frequent words per country; the frames are sorted ascending,
# so these are the last five rows.
for country_name, freqs in d_countries.items():
    print(country_name, freqs.tail(5), '\n', sep='\n')

United States
            word     count
252       series  0.057579
55   documentary  0.059547
160        world  0.066929
162         life  0.100394
18           new  0.104331


India
       word     count
113  family  0.087516
130    love  0.093951
109   woman  0.106821
50      man  0.146718
49    young  0.151866


United Kingdom
            word     count
53        family  0.074713
467  documentary  0.074713
564        world  0.086207
114         life  0.094828
440       series  0.106322


Japan
           word     count
19   mysterious  0.085227
94        world  0.113636
419      school  0.125000
30         high  0.136364
91        young  0.153409


Canada
       word     count
21   series  0.056738
628  school  0.070922
191     new  0.070922
160     two  0.092199
228   world  0.092199


South Korea
      word     count
37    love  0.095588
127    two  0.095588
39    life  0.110294
46   korea  0.125000
36     new  0.147059




In [20]:
# Plotting the top 10 words for each selected country, one horizontal bar chart per country.
country_sizes = df['country'].value_counts()  # titles per country value, computed once for the subplot titles
n_cols = 2
# Ceiling division: enough rows for every country (3 rows for the 6 selected countries).
n_rows = max(1, -(-len(d_countries) // n_cols))

fig = make_subplots(rows=n_rows,
                    cols=n_cols,
                    # ' (' with a leading space, matching the titles of the ratings figure
                    subplot_titles=[key + ' (' + str(country_sizes[key]) + ' titles)' for key in d_countries.keys()],
                    vertical_spacing=0.1)

# Fill the grid row by row, in the same order as the subplot titles.
for position, freqs in enumerate(d_countries.values()):
    top_words = freqs.iloc[-10:]  # frames are sorted ascending, so the tail holds the top words
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(go.Bar(y=top_words['word'],
                         x=top_words['count'],
                         text=[round(num, 2) for num in top_words['count']],
                         orientation='h'),
                  row=grid_row + 1,
                  col=grid_col + 1)

fig.update_layout(height=1000, width=1000, showlegend=False)
fig.update_traces(textposition='inside', textfont_size=24)
fig.show()

### A few takeaways:
1. There are proportionately more Indian and Japanese titles in this dataset with the word 'young' in their description.
2. India and South Korea are the only countries with the word 'woman' among the 10 most common words in their descriptions.
3. Japanese movies and shows are more 'mysterious' than those of other countries.
4. A lot of the action in Indian films and TV shows seems to take place in Mumbai.
5. The US and the UK have proportionately more documentaries in this dataset than the other top countries.