In [20]:
import pandas as pd
import os

## Data Loading

In [2]:
# Load the CSV file into a pandas DataFrame
file_path = "../datasets/imdb_data_ints.csv"  # Replace with your file's path if it's not in the current directory
imdb_data_df = pd.read_csv(file_path)

In [3]:
imdb_data_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49577,I thought this movie did a down right good job...,1
49578,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49579,I am a Catholic taught in parochial elementary...,0
49580,I'm going to have to disagree with the previou...,0


## Data Validation

In [4]:
# Get a description of the DataFrame
description = imdb_data_df.describe(include='all')  # Include all types of data
print("\nDescription of the DataFrame:")
print(description)


Description of the DataFrame:
                                                   review     sentiment
count                                               49582  49582.000000
unique                                              49582           NaN
top     No one expects the Star Trek movies to be high...           NaN
freq                                                    1           NaN
mean                                                  NaN      0.501876
std                                                   NaN      0.500002
min                                                   NaN      0.000000
25%                                                   NaN      0.000000
50%                                                   NaN      1.000000
75%                                                   NaN      1.000000
max                                                   NaN      1.000000


In [5]:
# Check the number of unique elements per column
unique_elements_per_column = imdb_data_df.nunique()
print("Number of unique elements per column:")
print(unique_elements_per_column)

Number of unique elements per column:
review       49582
sentiment        2
dtype: int64


In [6]:
# Check for null (NaN) values
null_values = imdb_data_df.isnull().sum()
print("Null values per column:")
print(null_values)

Null values per column:
review       0
sentiment    0
dtype: int64


In [7]:
# Check for duplicate rows
duplicate_rows = imdb_data_df.duplicated().sum()
print("\nNumber of duplicate rows:")
print(duplicate_rows)


Number of duplicate rows:
0


In [8]:
# Check data types of all columns
print("\nData types of each column:")
print(imdb_data_df.dtypes)


Data types of each column:
review       object
sentiment     int64
dtype: object


In [9]:
# Verify if all labels are binary (0 or 1) for the 'label' column
# Replace 'label' with the actual name of your column containing the labels
if 'sentiment' in imdb_data_df.columns:
    unique_labels = imdb_data_df['sentiment'].unique()
    print("\nUnique values in the 'sentiment' column:")
    print(unique_labels)
    if set(unique_labels).issubset({0, 1}):
        print("sentiments are binary (0 or 1).")
    else:
        print("sentiments are not binary! Unique values:", unique_labels)


Unique values in the 'sentiment' column:
[1 0]
sentiments are binary (0 or 1).


In [10]:
# Summary of basic information to check column names, non-null count, etc.
print("\nBasic info about the DataFrame:")
imdb_data_df.info()


Basic info about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49582 entries, 0 to 49581
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 774.8+ KB


In [11]:
# Final check: Ensure all rows are unique
if len(imdb_data_df) == len(imdb_data_df.drop_duplicates()):
    print("\nAll rows are unique.")
else:
    print("\nThere are duplicate rows in the dataset.")


All rows are unique.


In [12]:
# Check if all elements in the first column (assumed to be reviews) are unique
review_column_name = imdb_data_df.columns[0]  # Get the name of the first column
unique_reviews = imdb_data_df[review_column_name].is_unique

if unique_reviews:
    print(f"All elements in the '{review_column_name}' column are unique.")
else:
    print(f"There are duplicate reviews in the '{review_column_name}' column.")
    # Display duplicate reviews
    duplicate_reviews = imdb_data_df[imdb_data_df.duplicated(subset=[review_column_name])]
    print("\nDuplicate reviews:")
    print(duplicate_reviews)

All elements in the 'review' column are unique.


## Data Engineering

#### Generating IDs for each unique review

In [13]:
# Generate unique IDs based on the movie review column
review_column_name = imdb_data_df.columns[0]  # Assuming the first column is movie reviews
imdb_data_df['unique_id'] = imdb_data_df[review_column_name].apply(lambda x: hash(x))

In [14]:
# Move the 'unique_id' column to the beginning of the DataFrame
column_to_move = 'unique_id'
columns = [column_to_move] + [col for col in imdb_data_df if col != column_to_move]
imdb_data_df = imdb_data_df[columns]

In [15]:
# Display a sample of the DataFrame with the unique IDs
print("DataFrame with unique IDs based on reviews:")
imdb_data_df.head()

DataFrame with unique IDs based on reviews:


Unnamed: 0,unique_id,review,sentiment
0,-1776257223830691734,One of the other reviewers has mentioned that ...,1
1,-2308613510661184590,A wonderful little production. <br /><br />The...,1
2,8051669330003540499,I thought this was a wonderful way to spend ti...,1
3,2662031559311252467,Basically there's a family where a little boy ...,0
4,6305374558105916340,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [16]:
# Check the number of unique elements per column
unique_elements_per_column = imdb_data_df.nunique()
print("Number of unique elements per column:")
print(unique_elements_per_column)

Number of unique elements per column:
unique_id    49582
review       49582
sentiment        2
dtype: int64


In [17]:
# Replace 'sentiment' with the actual name of your label column
label_column_name = 'sentiment'

# Count the number of occurrences for each unique value in the label column
label_counts = imdb_data_df[label_column_name].value_counts()

print("Number of each binary label:")
print(label_counts)

Number of each binary label:
sentiment
1    24884
0    24698
Name: count, dtype: int64


In [21]:
folder_path = "../datasets"  # Replace with the actual path to your folder
file_name = "binary_imdb_data_with_ids.csv"

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

#Save the DataFrame as a CSV file
file_path = f"{folder_path}/{file_name}"
imdb_data_df.to_csv(file_path, index=False)

print(f"DataFrame saved as a CSV file at: {file_path}")

DataFrame saved as a CSV file at: ../datasets/binary_imdb_data_with_ids.csv
