In [30]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [31]:
# Project directories, given relative to the notebook's working directory.
DATA_DIR, OUTPUT_DIR = Path("../data"), Path("../output")

# Create the output directory (and any missing parents); a no-op if it already exists.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [32]:
# Locations of the three input CSVs inside DATA_DIR.
posts_file = DATA_DIR / "the-reddit-dataset-dataset-posts.csv"
comments_file = DATA_DIR / "the-reddit-dataset-dataset-comments.csv"
decor_file = DATA_DIR / "cleaned_decor.csv"

# Report, one line per file, whether each input is present on disk.
for csv_path in (posts_file, comments_file, decor_file):
    print(csv_path, "->", csv_path.exists())

..\data\the-reddit-dataset-dataset-posts.csv -> True
..\data\the-reddit-dataset-dataset-comments.csv -> True
..\data\cleaned_decor.csv -> True


In [33]:
# Load each input CSV into its own DataFrame using pandas' default parsing options.
posts_df, comments_df, decor_df = (
    pd.read_csv(csv_file)
    for csv_file in (posts_file, comments_file, decor_file)
)

In [34]:
# Preview the leading rows of each frame. sep="\n" starts every preview on a
# new line, which yields the same text as three consecutive print() calls.
print(posts_df.head(), comments_df.head(), decor_df.head(), sep="\n")

   index  type      id subreddit.id subreddit.name  subreddit.nsfw  \
0      0  post  t4f9bf        2r97t       datasets           False   
1      1  post  t4euxw        2r97t       datasets           False   
2      2  post  t4e0bb        2r97t       datasets           False   
3      3  post  t49fq0        2r97t       datasets           False   
4      4  post  t47wiw        2r97t       datasets           False   

   created_utc                                          permalink  \
0   1646160815  https://old.reddit.com/r/datasets/comments/t4f...   
1   1646159793  https://old.reddit.com/r/datasets/comments/t4e...   
2   1646157650  https://old.reddit.com/r/datasets/comments/t4e...   
3   1646145753  https://old.reddit.com/r/datasets/comments/t49...   
4   1646141275  https://old.reddit.com/r/datasets/comments/t47...   

          domain  url                                           selftext  \
0  pravda.com.ua  NaN                                          [removed]   
1  self.data

In [35]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() emitted a stray "None" line.
posts_df.info()

# Preview the leading rows of the posts table.
print(posts_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20292 entries, 0 to 20291
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           20292 non-null  int64 
 1   type            20292 non-null  object
 2   id              20292 non-null  object
 3   subreddit.id    20292 non-null  object
 4   subreddit.name  20292 non-null  object
 5   subreddit.nsfw  20292 non-null  bool  
 6   created_utc     20292 non-null  int64 
 7   permalink       20292 non-null  object
 8   domain          20292 non-null  object
 9   url             4976 non-null   object
 10  selftext        15316 non-null  object
 11  title           20292 non-null  object
 12  score           20292 non-null  int64 
dtypes: bool(1), int64(3), object(9)
memory usage: 1.9+ MB
None
   index  type      id subreddit.id subreddit.name  subreddit.nsfw  \
0      0  post  t4f9bf        2r97t       datasets           False   
1      1  post  t4euxw     

In [36]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() emitted a stray "None" line.
comments_df.info()

# Preview the leading rows of the comments table.
print(comments_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54851 entries, 0 to 54850
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           54851 non-null  int64  
 1   type            54851 non-null  object 
 2   id              54851 non-null  object 
 3   subreddit.id    54851 non-null  object 
 4   subreddit.name  54848 non-null  object 
 5   subreddit.nsfw  54848 non-null  object 
 6   created_utc     54848 non-null  float64
 7   permalink       54848 non-null  object 
 8   body            54846 non-null  object 
 9   sentiment       47367 non-null  float64
 10  score           54845 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 4.6+ MB
None
   index     type       id subreddit.id subreddit.name subreddit.nsfw  \
0      0  comment  hyyz6g8        2r97t       datasets          False   
1      1  comment  hyyid7v        2r97t       datasets          False   
2      2  comment  h

In [37]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() emitted a stray "None" line.
decor_df.info()

# Preview the leading rows of the decor table.
print(decor_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10163 entries, 0 to 10162
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          10163 non-null  object
 1   brand         10163 non-null  object
 2   rating        10163 non-null  int64 
 3   price         10163 non-null  int64 
 4   actual price  10163 non-null  int64 
 5   discount      10163 non-null  int64 
 6   shipping      10163 non-null  object
 7   category      10163 non-null  object
 8   deals         10163 non-null  object
 9   emi           10163 non-null  object
 10  url           10163 non-null  object
dtypes: int64(4), object(7)
memory usage: 873.5+ KB
None
                                                name       brand  rating  \
0  Multicolour Color Sheesham MDF Floor Rested Ma...      D'Dass       4   
1    Dark Sheesham MDF Floor Rested Mandir With Door      D'Dass       4   
2  Copper Finish Sheesham Wood & MDF Floor Rested...      D'

In [38]:
# Per-column count of missing values for each frame, printed one frame after
# another. isna() is the method that isnull() aliases.
print(
    posts_df.isna().sum(),
    comments_df.isna().sum(),
    decor_df.isna().sum(),
    sep="\n",
)

index                 0
type                  0
id                    0
subreddit.id          0
subreddit.name        0
subreddit.nsfw        0
created_utc           0
permalink             0
domain                0
url               15316
selftext           4976
title                 0
score                 0
dtype: int64
index                0
type                 0
id                   0
subreddit.id         0
subreddit.name       3
subreddit.nsfw       3
created_utc          3
permalink            3
body                 5
sentiment         7484
score                6
dtype: int64
name            0
brand           0
rating          0
price           0
actual price    0
discount        0
shipping        0
category        0
deals           0
emi             0
url             0
dtype: int64


In [39]:
# Summary statistics for every posts column; include='all' covers the
# non-numeric columns (unique/top/freq) as well as the numeric ones.
posts_summary = posts_df.describe(include='all')
print(posts_summary)

               index   type      id subreddit.id subreddit.name  \
count   20292.000000  20292   20292        20292          20292   
unique           NaN      1   20292            1              1   
top              NaN   post  t4f9bf        2r97t       datasets   
freq             NaN  20292       1        20292          20292   
mean    10145.500000    NaN     NaN          NaN            NaN   
std      5857.940167    NaN     NaN          NaN            NaN   
min         0.000000    NaN     NaN          NaN            NaN   
25%      5072.750000    NaN     NaN          NaN            NaN   
50%     10145.500000    NaN     NaN          NaN            NaN   
75%     15218.250000    NaN     NaN          NaN            NaN   
max     20291.000000    NaN     NaN          NaN            NaN   

       subreddit.nsfw   created_utc  \
count           20292  2.029200e+04   
unique              1           NaN   
top             False           NaN   
freq            20292           NaN   


In [40]:
# Summary statistics for every comments column; include='all' covers the
# non-numeric columns (unique/top/freq) as well as the numeric ones.
comments_summary = comments_df.describe(include='all')
print(comments_summary)

               index     type       id subreddit.id subreddit.name  \
count   54851.000000    54851    54851        54851          54848   
unique           NaN        4    54851            3              1   
top              NaN  comment  hyyz6g8        2r97t       datasets   
freq             NaN    54848        1        54848          54848   
mean    27425.000000      NaN      NaN          NaN            NaN   
std     15834.264145      NaN      NaN          NaN            NaN   
min         0.000000      NaN      NaN          NaN            NaN   
25%     13712.500000      NaN      NaN          NaN            NaN   
50%     27425.000000      NaN      NaN          NaN            NaN   
75%     41137.500000      NaN      NaN          NaN            NaN   
max     54850.000000      NaN      NaN          NaN            NaN   

       subreddit.nsfw   created_utc  \
count           54848  5.484800e+04   
unique              1           NaN   
top             False           NaN   
fre

In [41]:
# Summary statistics for every decor column; include='all' covers the
# non-numeric columns (unique/top/freq) as well as the numeric ones.
decor_summary = decor_df.describe(include='all')
print(decor_summary)

                                                     name   brand  \
count                                               10163   10163   
unique                                               6222     227   
top     Multicolour Modern Plastic Stylish Non Ticking...  Random   
freq                                                  354     890   
mean                                                  NaN     NaN   
std                                                   NaN     NaN   
min                                                   NaN     NaN   
25%                                                   NaN     NaN   
50%                                                   NaN     NaN   
75%                                                   NaN     NaN   
max                                                   NaN     NaN   

              rating         price  actual price      discount  \
count   10163.000000  10163.000000  10163.000000  10163.000000   
unique           NaN           NaN     