# Preprocessing Search History

## Import Library

In [1]:
import pandas as pd
import json
# Notebook-wide pandas settings, all routed through set_option for consistency.
# Copy-on-write is switched on, and the display row/column caps are removed
# (None means "no limit") so frames are never truncated when rendered.
pd.set_option("mode.copy_on_write", True)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

## Load Data

In [2]:
# Absolute local path of the exported search-history JSON file.
history_json = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/history/search-history.json'

# Decode the file as UTF-8 and parse it, then build the frame from the parsed
# records once the file handle has been closed.
with open(history_json, encoding='utf-8') as handle:
    records = json.load(handle)
df = pd.DataFrame.from_records(records)

## Data Info

In [3]:
# Print the column names, non-null counts, dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   header            62 non-null     object
 1   title             62 non-null     object
 2   titleUrl          62 non-null     object
 3   time              62 non-null     object
 4   products          62 non-null     object
 5   activityControls  62 non-null     object
 6   description       42 non-null     object
 7   details           42 non-null     object
dtypes: object(8)
memory usage: 4.0+ KB


## Descriptive Statistics

In [4]:
# Summary statistics per column, transposed so each source column becomes a row.
df.describe().T

Unnamed: 0,count,unique,top,freq
header,62,1,YouTube,62
title,62,58,Searched for statquest,3
titleUrl,62,58,https://www.youtube.com/results?search_query=s...,3
time,62,62,2024-03-13T15:09:56.510Z,1
products,62,1,[YouTube],62
activityControls,62,2,"[Web & App Activity, YouTube watch history, Yo...",42
description,42,40,Watched at 7:52 PM,2
details,42,1,[{'name': 'From Google Ads'}],42


## Missing Value

In [5]:
# Count the missing entries in every column.
df.isnull().sum()

header               0
title                0
titleUrl             0
time                 0
products             0
activityControls     0
description         20
details             20
dtype: int64

## Value Count

In [6]:
# For every column, print a small banner, the number of rows, and the
# frequency of each distinct value.
for column_name in df.columns:
    column = df[column_name]
    print(f'\n------ Column {column_name}------')
    print(f'------ Total Lenght {len(column)}------')
    print(column.value_counts())


------ Column header------
------ Total Lenght 62------
header
YouTube    62
Name: count, dtype: int64

------ Column title------
------ Total Lenght 62------
title
Searched for statquest                                                                         3
Watched 【Astral Tale】Classic MMORPG is Back with Global Servers!                               2
Searched for shinmei                                                                           2
Watched Perbandingan Sewa vs Beli                                                              1
Watched Ramadan 2024 2024 15 Games Nutrimart                                                   1
Watched The future of testing, here today                                                      1
Watched FLOYA - Yume (OFFICIAL VIDEO)                                                          1
Watched YouTube Premium EMU MIX 1mon Alot Stock 12s horiz 1920x1080 16-9 id ID IDR             1
Watched WordPress Hosting | Hostinger.co.id               

## Header Column

### Drop Header Column

In [7]:
# `header` holds the single value 'YouTube' on every row, so it is removed.
# Reassigning the result replaces the in-place drop; the resulting frame is the same.
df = df.drop(columns=['header'])
df.head()

Unnamed: 0,title,titleUrl,time,products,activityControls,description,details
0,Searched for kaca biru,https://www.youtube.com/results?search_query=k...,2024-03-13T15:09:56.510Z,[YouTube],[YouTube search history],,
1,Searched for statquest,https://www.youtube.com/results?search_query=s...,2024-03-13T14:18:15.610Z,[YouTube],[YouTube search history],,
2,Watched Adonan Sulit Tercampur?,https://www.youtube.com/watch?v=IBokV9_H_W8,2024-03-13T13:34:04.867Z,[YouTube],"[Web & App Activity, YouTube watch history, Yo...",Watched at 8:34 PM,[{'name': 'From Google Ads'}]
3,Watched #BeraniLebih Lawan Minyak Kayak Asnawi...,https://www.youtube.com/watch?v=eOas12RHAQ4,2024-03-13T13:33:57.266Z,[YouTube],"[Web & App Activity, YouTube watch history, Yo...",Watched at 8:33 PM,[{'name': 'From Google Ads'}]
4,Watched Boost Teamwork | Get Your Tone Just Right,https://www.youtube.com/watch?v=Jbir6yDKZUE,2024-03-13T13:08:35.477Z,[YouTube],"[Web & App Activity, YouTube watch history, Yo...",Watched at 8:08 PM,[{'name': 'From Google Ads'}]


## Details Column

In [8]:
# Frequency of each distinct non-null value in `details`.
df['details'].value_counts()

details
[{'name': 'From Google Ads'}]    42
Name: count, dtype: int64

In [9]:
# Keep only the rows that have no `details` value; rows with a `details`
# entry (the 'From Google Ads' marker seen in the counts above) are dropped.
no_details = df['details'].isnull()
df = df.loc[no_details]

In [10]:
# Remove the `details` column; the filter applied just before kept only rows where it is null.
df = df.drop(columns=['details'])

## Products Column

### Drop Products Column

In [11]:
# Remove `products`: the describe() output shows a single unique value ([YouTube]).
df = df.drop(columns='products')

## activityControls Column

In [12]:
# Frequency of each distinct value left in `activityControls` after the filtering above.
df['activityControls'].value_counts()

activityControls
[YouTube search history]    20
Name: count, dtype: int64

In [13]:
def _first_activity_control(controls):
    """Return the first entry of an activity-controls list.

    Parameters
    ----------
    controls : object
        One cell of the `activityControls` column.

    Returns
    -------
    object
        The first element when `controls` is a non-empty list, None when it is
        an empty list, and `controls` itself for any non-list value.
    """
    if isinstance(controls, list):
        return controls[0] if controls else None
    return controls


# Unwrap the list so the column holds a plain value instead of a list.
# The list check keeps this cell safe to re-run: indexing an already-unwrapped
# string with [0] would truncate it to its first character, and indexing an
# empty list would raise IndexError.
df['activityControls'] = df['activityControls'].apply(_first_activity_control)

## Description Column

In [14]:
# Frequency of each distinct non-null value in `description` (missing values are not counted).
df['description'].value_counts()

Series([], Name: count, dtype: int64)

In [15]:
# Remove `description`: the value counts above are empty, so no non-null value remains.
df = df.drop(columns=['description'])

## Time Column

In [16]:
# Convert the ISO 8601 strings (e.g. '2024-03-13T15:09:56.510Z') to timestamps.
# format='ISO8601' accepts any ISO 8601 variant per value, instead of inferring
# one fixed format that every row must then match exactly; utc=True makes the
# resulting column timezone-aware in UTC.
df['time'] = pd.to_datetime(df['time'], format='ISO8601', utc=True)

## Splitting Query From titleUrl

### Check Value Format

In [17]:
# Look at the URL stored under index label 0 to see how the query is embedded.
df.loc[0, 'titleUrl']

'https://www.youtube.com/results?search_query=kaca+biru'

### Splitting

In [18]:
# Extract the raw value of the `search_query` URL parameter by name instead of
# slicing at a fixed character offset, so a different URL prefix or additional
# parameters after the query cannot corrupt the result.
# The value is kept URL-encoded (e.g. 'kaca+biru'); URLs that do not contain
# the parameter yield NaN.
df['query'] = df['titleUrl'].str.extract(r'[?&]search_query=([^&#]*)', expand=False)

## Final Preview Data

In [19]:
# Display the preprocessed frame as the cell output.
df

Unnamed: 0,title,titleUrl,time,activityControls,query
0,Searched for kaca biru,https://www.youtube.com/results?search_query=k...,2024-03-13 15:09:56.510000+00:00,YouTube search history,kaca+biru
1,Searched for statquest,https://www.youtube.com/results?search_query=s...,2024-03-13 14:18:15.610000+00:00,YouTube search history,statquest
5,Searched for frieren,https://www.youtube.com/results?search_query=f...,2024-03-13 13:06:36.537000+00:00,YouTube search history,frieren
6,Searched for kobo kanaeru,https://www.youtube.com/results?search_query=k...,2024-03-13 13:03:03.778000+00:00,YouTube search history,kobo+kanaeru
7,Searched for muse in classroom of the elite,https://www.youtube.com/results?search_query=m...,2024-03-13 13:00:56.703000+00:00,YouTube search history,muse+in+classroom+of+the+elite+
16,Searched for stardenburdenhardenbart,https://www.youtube.com/results?search_query=s...,2024-03-13 09:12:49.769000+00:00,YouTube search history,stardenburdenhardenbart
25,Searched for shinmei,https://www.youtube.com/results?search_query=s...,2024-03-12 20:37:02.916000+00:00,YouTube search history,shinmei
29,Searched for muse indonesia frieren,https://www.youtube.com/results?search_query=m...,2024-03-12 19:44:35.049000+00:00,YouTube search history,muse+indonesia+frieren
38,Searched for shinmei,https://www.youtube.com/results?search_query=s...,2024-03-12 12:36:57.581000+00:00,YouTube search history,shinmei
39,Searched for toho color,https://www.youtube.com/results?search_query=t...,2024-03-12 07:38:58.277000+00:00,YouTube search history,toho+color


## Save Data

In [20]:
# Absolute local path of the CSV that receives the preprocessed search history.
output_csv = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/preprocessed/search_history.csv'

# Write the frame without the index column.
df.to_csv(output_csv, index=False)