# Load Datasets

In [1]:
import pandas as pd
import sys
import os
import csv
# Add new system path to import config file
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../")))
# Build path two levels up
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../../"))
import config

# Users aka. Devices

In [2]:
users = pd.read_csv(os.path.join(base_dir, config.users))
users.head()

Unnamed: 0,deviceid,platform,os_version,model,networkType,district,lastknownsubadminarea,language_selected,created_datetime,app_updated_at,last_active_at
0,197b123e-eb9e-4fc1-a32d-aa86aaea425e,ANDROID,13.0,,4G,,,en,2023-07-11T13:40:05.511Z,,2023-07-11T13:40:02.000Z
1,3c33c537-7c6c-40f5-835c-f997e883cae2,ANDROID,13.0,,4G,,,en,2023-07-11T15:36:58.363Z,,2023-07-11T16:24:44.000Z
2,6c7be5d0-d4d8-469f-91be-8055021ceef9,ANDROID,12.0,,NO INTERNET,,,en,2023-07-11T11:25:39.375Z,,2023-07-11T10:13:18.000Z
3,0801af66-0a6f-4fdd-82a9-c2b15757b8f5,ANDROID,7.0,,NO INTERNET,,,en,2023-04-30T00:24:48.987Z,,2023-07-10T11:53:21.000Z
4,78b3c7a7-5881-42dc-9f8e-b4fc27f94360,ANDROID,13.0,,4G,,,en,2023-04-15T05:48:46.923Z,,2023-07-20T15:37:47.000Z


In [3]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10400 entries, 0 to 10399
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   deviceid               10400 non-null  object
 1   platform               10400 non-null  object
 2   os_version             10400 non-null  object
 3   model                  10344 non-null  object
 4   networkType            10352 non-null  object
 5   district               21 non-null     object
 6   lastknownsubadminarea  9492 non-null   object
 7   language_selected      10400 non-null  object
 8   created_datetime       10400 non-null  object
 9   app_updated_at         7236 non-null   object
 10  last_active_at         10400 non-null  object
dtypes: object(11)
memory usage: 893.9+ KB


# Events

In [4]:
events = pd.read_csv(os.path.join(base_dir, config.events_list[0]))
# Concatenate vertically (row-wise)
for i in range(1, len(config.events_list)):
    temp_df = pd.read_csv(os.path.join(base_dir, config.events_list[i]))
    events = pd.concat([events, temp_df], ignore_index=True)

In [5]:
events.head()

Unnamed: 0,deviceId,event_type,eventTimestamp,hashId,categoryWhenEventHappened,cardViewPosition,overallTimeSpent,searchTerm,relevancy_color,relevancy_topic,state,locality,district
0,7cb933f6-0a5b-477e-bb5d-42eb4b771970,TimeSpent-Front,1689189015000,2k4lruyx-1,My Feed,13.0,2.165,,,,,,
1,a1bfc2e4-c03e-4c1b-8abb-215808a89e6a,TimeSpent-Front,1689182865000,2k4lruyx-1,My Feed,17.0,3.306,,,,,,
2,1c53a149-303d-486e-ac62-0b9c9e469cda,TimeSpent-Front,1689180216000,2k4lruyx-1,My Feed,17.0,4.695,,,,,,
3,fea7a467-551c-4b64-ad48-eae0693635f3,TimeSpent-Front,1689173651000,2kk4ydgg-1,My Feed,25.0,4.238,,,,,,
4,1ef62ab0-e5da-4cb4-9839-49ac52f04ceb,TimeSpent-Front,1689174004000,2kk4ydgg-1,My Feed,12.0,0.931,,,,,,


In [6]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3544161 entries, 0 to 3544160
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   deviceId                   object 
 1   event_type                 object 
 2   eventTimestamp             int64  
 3   hashId                     object 
 4   categoryWhenEventHappened  object 
 5   cardViewPosition           float64
 6   overallTimeSpent           float64
 7   searchTerm                 object 
 8   relevancy_color            object 
 9   relevancy_topic            object 
 10  state                      object 
 11  locality                   object 
 12  district                   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 351.5+ MB


# Contents

In [7]:
def count_lines(path):
    with open(path, encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    df = pd.read_csv(path, on_bad_lines='skip')
    return df, total_lines

train_path = os.path.join(base_dir, config.training)
test_path = os.path.join(base_dir, config.testing)

train_df, train_lines = count_lines(train_path)
test_df, test_lines = count_lines(test_path)

# Calculate percentage of bad lines (excluding header)
train_bad_pct = ((train_lines - 1 - len(train_df)) / (train_lines - 1)) * 100
test_bad_pct = ((test_lines - 1 - len(test_df)) / (test_lines - 1)) * 100

print(f"Bad lines in train: {train_bad_pct:.2f}%")
print(f"Bad lines in test: {test_bad_pct:.2f}%")


Bad lines in train: 44.31%
Bad lines in test: 28.31%


In [8]:
train_df.head()

Unnamed: 0,hashid,title,content,newsType,author,categories,hashtags,newsDistrict,createdAt,updatedAt,newsLanguage,sourceName
0,q6ymyudu-1,The world's most picturesque road trip awaits ...,The world's most picturesque road trip awaits ...,VIDEO_NEWS,5748e05c947ce445479635a0,travel,,,2023-06-04T08:30:15.000Z,2023-06-04T08:30:15.000Z,english,
1,vowkjbc6-1,"In a viral video, Kathak dancers perform in a ...","In a viral video, Kathak dancers perform in a ...",VIDEO_NEWS,5748e05c947ce445479635a0,entertainment,,,2023-06-05T15:02:18.000Z,2023-06-05T15:02:18.000Z,english,
2,8bf90wsx-1,Would you like to be a part of the 'no wash' m...,Would you like to be a part of the 'no wash' m...,VIDEO_NEWS,5748e05c947ce445479635a0,hatke,,,2023-06-02T09:22:13.000Z,2023-06-02T09:22:13.000Z,english,
3,j0p0fukx-1,Switzerland's legendary Bernina Express turns 50,Switzerland's legendary Bernina Express turns 50,VIDEO_NEWS,5748e05c947ce445479635a0,travel,,,2023-06-04T13:24:40.000Z,2023-06-04T13:24:40.000Z,english,
4,naqusao7-1,"Explored by many, claimed by none, this is Bir...","Explored by many, claimed by none, this is Bir...",VIDEO_NEWS,5748e05c947ce445479635a0,"hatke,travel",,,2023-06-04T13:20:20.000Z,2023-06-04T13:20:20.000Z,english,


In [9]:
test_df.head()

Unnamed: 0,hashid,title,content,newsType,author,categories,hashtags,newsDistrict,createdAt,updatedAt,newsLanguage,sourceName
0,zdw0jrig-1,Redmi 12 5G will be a game-changer for 5G conn...,Xiaomi will debut Redmi 12 5G alongside Redmi ...,NEWS,593f9d1f81ef171ab3b63a2d,technology,,,2023-07-27T07:06:41.000Z,2023-07-27T07:06:41.000Z,english,Xiaomi
1,y5pfnbmp-1,Limited seats left for Hero Vired & MIT’s Prog...,Hero Group's EdTech company Hero Vired & MIT l...,NEWS,593f9d1f81ef171ab3b63a2d,education,,,2023-07-27T04:30:50.000Z,2023-07-27T04:30:50.000Z,english,vired.com
2,eo2eyhgk-1,Heavy to very heavy rainfall warning issued fo...,IMD has issued heavy to very heavy rainfall wa...,NEWS,5f70de9bd43821580e6d7022,national,,,2023-07-27T16:23:53.000Z,2023-07-27T17:12:11.285Z,english,IMD
3,fknyydal-1,Which 14 teams have qualified for 20-team T20 ...,Ireland and Scotland have qualified for the 20...,NEWS,5f70de9bd43821580e6d7022,sports,,,2023-07-27T14:29:59.000Z,2023-07-27T14:29:59.000Z,english,ICC
4,61ogen4w-1,42-year-old woman shot dead near her house in ...,A 42-year-old woman was shot dead near her hou...,NEWS,5f70de9bd43821580e6d7022,national,,,2023-07-27T17:07:31.000Z,2023-07-27T17:07:31.000Z,english,ABP


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8170 entries, 0 to 8169
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   hashid        8170 non-null   object
 1   title         8170 non-null   object
 2   content       8170 non-null   object
 3   newsType      7823 non-null   object
 4   author        8154 non-null   object
 5   categories    8145 non-null   object
 6   hashtags      1255 non-null   object
 7   newsDistrict  33 non-null     object
 8   createdAt     8155 non-null   object
 9   updatedAt     8155 non-null   object
 10  newsLanguage  8061 non-null   object
 11  sourceName    7971 non-null   object
dtypes: object(12)
memory usage: 766.1+ KB


In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970 entries, 0 to 969
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   hashid        970 non-null    object
 1   title         970 non-null    object
 2   content       970 non-null    object
 3   newsType      969 non-null    object
 4   author        968 non-null    object
 5   categories    967 non-null    object
 6   hashtags      37 non-null     object
 7   newsDistrict  313 non-null    object
 8   createdAt     968 non-null    object
 9   updatedAt     968 non-null    object
 10  newsLanguage  967 non-null    object
 11  sourceName    905 non-null    object
dtypes: object(12)
memory usage: 91.1+ KB
