# **TikTok Project**


### Install, Import Modules, Load Data

In [None]:
# Install seaborn
%pip install seaborn

# Import packages for data manipulation
import numpy as np
import pandas as pd

# Import packages for data visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load dataset into dataframe
# The path is relative, so the CSV is looked up in the notebook's working directory.
df = pd.read_csv("tiktok_dataset.csv")

### **Data exploration and cleaning**

Use functions that help understand and structure the data.

*    `.head()`
*    `.info()`
*    `.describe()`
*    `.size`
*    `.shape`

In [None]:
# Display and examine the first few rows of the dataframe
# (`.head()` with no argument returns the first five rows)
df.head()

In [None]:
# Get the size of the data: the total number of cells (rows x columns)
df.size

In [None]:
# Get the shape of the data as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Get basic information about the data:
# column names, non-null counts, dtypes, and memory usage
df.info()

In [None]:
# Generate a table of descriptive statistics
# (by default `.describe()` summarizes the numeric columns only)
df.describe()

### **Build visualizations**



#### **video_duration_sec**

Create a box plot to examine the spread of values in the `video_duration_sec` column.

In [None]:
# Create a boxplot to visualize distribution of `video_duration_sec`
# A short, wide canvas is enough for a single horizontal box.
plt.figure(figsize=(5, 1))
sns.boxplot(data=df,
            x='video_duration_sec')
plt.title('Video Duration Boxplot')
plt.xlabel('Video Duration')
# Call show() (with parentheses) so the figure is rendered and the cell
# does not echo the bare function object as its output.
plt.show()

Create a histogram of the values in the `video_duration_sec` column to further explore the distribution of this variable.

In [None]:
# Create a histogram of `video_duration_sec`
plt.figure(figsize=(10, 6))
# Bin edges every 5 units from 5 to 65; reused for the x ticks so that
# every tick sits exactly on a bin edge.
duration_bins = range(5, 66, 5)
ax = sns.histplot(data=df,
                  x='video_duration_sec',
                  bins=duration_bins)
ax.set_xticks(duration_bins)
plt.title('Video Duration Histogram')
plt.xlabel('Video Duration As Second')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **video_view_count**

Create a box plot to examine the spread of values in the `video_view_count` column.

In [None]:
# Create a boxplot to visualize distribution of `video_view_count`
sns.boxplot(data=df,
            x='video_view_count')
plt.title('Video View Count Boxplot')
plt.xlabel('Video View Count')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

Create a histogram of the values in the `video_view_count` column to further explore the distribution of this variable.

In [None]:
# Create a histogram of `video_view_count`
# Bin edges every 100,000 from 0 up to and including 1,000,000.
sns.histplot(data=df,
             x='video_view_count',
             bins=range(0, 10**6 + 1, 10**5))
plt.xlabel('Video View Count')
plt.title('Video View Count Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **video_like_count**

Create a box plot to examine the spread of values in the `video_like_count` column.

In [None]:
# Create a boxplot to visualize distribution of `video_like_count`
sns.boxplot(x=df["video_like_count"])
plt.title('Video Like Count Boxplot')
plt.xlabel('Video Like Count')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

Create a histogram of the values in the `video_like_count` column to further explore the distribution of this variable.

In [None]:
# Create a histogram of `video_like_count`
plt.figure(figsize=(10, 6))
# Bin edges every 100,000 from 0 up to and including 700,000.
sns.histplot(x=df['video_like_count'],
             bins=range(0, 7 * 10**5 + 1, 10**5))
plt.xlabel('Video Like Count')
plt.title('Video Like Count Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **video_comment_count**

Create a box plot to examine the spread of values in the `video_comment_count` column.

In [None]:
# Create a boxplot to visualize distribution of `video_comment_count`
plt.figure(figsize=(10, 1))
# Pass the column as `x=` explicitly: a positional Series is bound to `data`
# in recent seaborn releases and is then drawn as a vertical box, which does
# not match the x-axis label or the wide, short figure.
sns.boxplot(x=df['video_comment_count'])
plt.title('Video Comment Count Boxplot')
plt.xlabel('Video Comment Count')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

Create a histogram of the values in the `video_comment_count` column to further explore the distribution of this variable.

In [None]:
# Create a histogram of `video_comment_count`
plt.figure(figsize=(10, 5))
# Bin edges every 1,000 from 0 up to and including 10,000.
sns.histplot(x=df['video_comment_count'],
             bins=range(0, 10**4 + 1, 10**3))
plt.xlabel('Video Comment Count')
plt.title('Video Comment Count Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **video_share_count**

Create a box plot to examine the spread of values in the `video_share_count` column.

In [None]:
# Create a boxplot to visualize distribution of `video_share_count`
# Pass the column as `x=` explicitly: a positional Series is bound to `data`
# in recent seaborn releases and is then drawn as a vertical box, which does
# not match the x-axis label set below.
sns.boxplot(x=df['video_share_count'])
plt.xlabel('Video Share Count')
plt.title("Video Share Count Boxplot")
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

Create a histogram of the values in the `video_share_count` column to further explore the distribution of this variable.

In [None]:
# Create a histogram of `video_share_count`
plt.figure(figsize=(10, 5))
# Bin edges every 10,000 from 0 to 260,000 (the range stop of 270,000 is exclusive).
sns.histplot(x=df['video_share_count'],
             bins=range(0, 27 * 10**4, 10**4))
plt.xlabel('Video Share Count')
plt.title('Video Share Count Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **video_download_count**

Create a box plot to examine the spread of values in the `video_download_count` column.

In [None]:
# Create a boxplot to visualize distribution of `video_download_count`
# Pass the column as `x=` explicitly: a positional Series is bound to `data`
# in recent seaborn releases and is then drawn as a vertical box, which does
# not match the x-axis label set below.
sns.boxplot(x=df['video_download_count'])
plt.xlabel('Video Download Count')
plt.title('Video Download Count Boxplot')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

Create a histogram of the values in the `video_download_count` column to further explore the distribution of this variable.

In [None]:
# Create a histogram of `video_download_count`
plt.figure(figsize=(10, 5))
# Bin edges every 1,000 from 0 up to and including 16,000.
sns.histplot(x=df['video_download_count'],
             bins=range(0, 16 * 10**3 + 1, 10**3))
plt.xlabel('Video Download Count')
plt.title('Video Download Count Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **Claim status by verification status**

Now, create a histogram with four bars: one for each combination of claim status and verification status.

In [None]:
# Create a histogram of claim status split by verification status
plt.figure(figsize=(10, 6))
sns.histplot(data=df,
             x='claim_status',
             hue='verified_status',
             multiple='dodge',  # draw the hue groups side by side instead of stacked
             shrink=0.8)        # narrow the bars to leave a gap between categories
plt.xlabel('Claim Status')
plt.title('Claims By Verified Status Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **Claim status by author ban status**

The previous course used a `groupby()` statement to examine the count of each claim status for each author ban status. Now, use a histogram to communicate the same information.

In [None]:
# Create a histogram of claim status split by author ban status
plt.figure(figsize=(10, 6))
sns.histplot(data=df,
             x='claim_status',
             hue='author_ban_status',
             multiple='dodge',  # draw the hue groups side by side instead of stacked
             # Fix the legend/bar order explicitly rather than relying on data order.
             hue_order=['active', 'under review', 'banned'],
             shrink=0.8)        # narrow the bars to leave a gap between categories
plt.xlabel('Claim Status')
plt.title('Claims By Author Ban Status Histogram')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

#### **Median view counts by ban status**

Create a bar plot with three bars: one for each author ban status. The height of each bar should correspond with the median number of views for all videos with that author ban status.

In [None]:
# Create a bar plot of the median view count for each author ban status
# One row per ban status; reset_index() turns the group key back into a column
# so it can be referenced by name in the plot.
median_view_count = df.groupby('author_ban_status')[['video_view_count']].median().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=median_view_count,
            x='author_ban_status',
            y='video_view_count',
            order=['active', 'under review', 'banned'])
plt.title('Median View Count By Author Ban Status')
plt.xlabel('Author Ban Status')
plt.ylabel('Median Video View')
# Render the figure explicitly so the cell does not echo the Text object
# returned by the last labelling call.
plt.show()

In [None]:
# Median `video_view_count` per claim status, returned as a flat table
# (the group key is restored as a regular column).
median_view_count_claim = (
    df.groupby('claim_status')[['video_view_count']]
    .median()
    .reset_index()
)
median_view_count_claim

#### **Total views by claim status**

Create a pie graph that depicts the proportions of total views for claim videos and total views for opinion videos.

In [None]:
# Create a pie graph of total views per claim status
total_view_claim = df.groupby('claim_status')[['video_view_count']].sum()

# plt.pie() needs one-dimensional data, so pass the single column as a Series
# rather than the whole one-column DataFrame.
# Labels are derived from the group index so each wedge is always paired with
# its own status, whatever order the groups come out in.
plt.pie(total_view_claim['video_view_count'],
        labels=[str(status).capitalize() for status in total_view_claim.index])
plt.title('Total View By Claim Status Piechart')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()

### **Determine outliers**

When building predictive models, the presence of outliers can be problematic. 

A common way to determine outliers in a normal distribution is to calculate the interquartile range (IQR) and set a threshold that is 1.5 * IQR above the 3rd quartile.

In this TikTok dataset, the values for the count variables are not normally distributed. They are heavily skewed to the right. One way of modifying the outlier threshold is by calculating the **median** value for each variable and then adding 1.5 * IQR. This results in a threshold that is, in this case, much lower than it would be if you used the 3rd quartile.

Write a for loop that iterates over the column names of each count variable. For each iteration:
1. Calculate the IQR of the column
2. Calculate the median of the column
3. Calculate the outlier threshold (median + 1.5 * IQR)
4. Calculate the number of videos with a count in that column that exceeds the outlier threshold
5. Print "Number of outliers, {column name}: {outlier count}"

In [None]:
# Columns holding per-video engagement counts to screen for outliers
count_cols = [
    'video_view_count',
    'video_like_count',
    'video_share_count',
    'video_download_count',
    'video_comment_count',
]

for col in count_cols:
    values = df[col]

    # Interquartile range: spread of the middle 50% of the column
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # Threshold is anchored on the median rather than the 3rd quartile
    cutoff = values.median() + 1.5*spread

    # Tally the rows whose value lies strictly above the cutoff
    n_above = (values > cutoff).sum()
    print(f'Number of outliers, {col}:', n_above)

#### **Scatterplot**

In [None]:
# Create a scatterplot of `video_view_count` versus `video_like_count` according to 'claim_status'
sns.scatterplot(data = df,
                x = 'video_view_count',
                y = 'video_like_count',
               hue = 'claim_status',
               alpha = .3,
               s = 10)
plt.xlabel('Video View Count')
plt.ylabel('Video Like Count')
plt.title('Video View And Like Count By Claim Status Scatterplot ')
plt.show


In [None]:
# Create a scatterplot of `video_view_count` versus `video_like_count` for opinions only
# Keep only the rows whose claim status is exactly 'opinion'.
df_opinion = df[df['claim_status'] == 'opinion']

sns.scatterplot(data=df_opinion,
                x='video_view_count',
                y='video_like_count',
                alpha=.3,  # semi-transparent markers so overlapping points stay visible
                s=10)      # small markers
plt.xlabel('Video View Count')
plt.ylabel('Video Like Count')
# Title names the opinion-only subset so it is not confused with the
# all-videos scatterplot coloured by claim status.
plt.title('Video View And Like Count For Opinion Videos Scatterplot')
# Call show() so the figure is rendered instead of echoing the function object.
plt.show()



You can do a scatterplot in Tableau Public as well, which can be easier to manipulate and present. If you'd like step by step instructions, you can review the instructions linked in the previous Activity page.