WORD TOKENIZER


In [5]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd

# Fetch every NLTK tokenizer resource this cell relies on
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

def load_and_tokenize(file_path, tokenize_by='word'):
    """
    Load a CSV dataset and tokenize the text in its 'review' column.

    Parameters:
    file_path (str): Path to the CSV file containing the data.
    tokenize_by (str): Tokenization type ('word' or 'sentence'). Default is 'word'.

    Returns:
    pd.DataFrame: The loaded DataFrame with an additional 'tokenized_review'
        column holding one list of tokens (or sentences) per row. Rows whose
        review is missing get an empty list.

    Raises:
    ValueError: If tokenize_by is not 'word' or 'sentence', or if the
        dataset has no 'review' column.
    """
    # Validate the mode before touching the file so a typo fails immediately
    # instead of after the whole CSV has been parsed.
    if tokenize_by not in ('word', 'sentence'):
        raise ValueError("Invalid tokenize_by value. Use 'word' or 'sentence'.")

    # Load the dataset
    data = pd.read_csv(file_path)

    # Check if 'review' column exists
    if 'review' not in data.columns:
        raise ValueError("The dataset must contain a 'review' column.")

    # Pick the NLTK tokenizer matching the (already validated) mode
    tokenizer = word_tokenize if tokenize_by == 'word' else sent_tokenize

    # read_csv turns empty cells into NaN (a float), which the tokenizers
    # cannot process; treat missing reviews as empty text instead.
    data['tokenized_review'] = data['review'].fillna('').apply(tokenizer)

    return data

# Example usage
tokenize_by = 'word'  # Change to 'sentence' for sentence tokenization
file_path = '/content/drive/MyDrive/IMDB Dataset.csv'  # Replace with the actual file path
data = load_and_tokenize(file_path, tokenize_by=tokenize_by)

# Show the raw text next to its tokens for the first few rows
print(data.loc[:, ['review', 'tokenized_review']].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                    tokenized_review  
0  [One, of, the, other, reviewers, has, mentione...  
1  [A, wonderful, little, production, ., <, br, /...  
2  [I, thought, this, was, a, wonderful, way, to,...  
3  [Basically, there, 's, a, family, where, a, li...  
4  [Petter, Mattei, 's, ``, Love, in, the, Time, ...  


SENTENCE TOKENIZER

In [6]:
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd

# Ensure nltk resources are available.
# 'punkt_tab' is downloaded in addition to 'punkt' because recent NLTK
# releases load the Punkt sentence model from 'punkt_tab'; without it,
# running this cell on its own in a fresh runtime can leave sent_tokenize
# unable to find its data.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_and_sentence_tokenize(file_path):
    """
    Load a CSV dataset and split each entry of its 'review' column into sentences.

    Parameters:
    file_path (str): Path to the CSV file containing the data.

    Returns:
    pd.DataFrame: The loaded DataFrame with an additional
        'sentence_tokenized_review' column holding one list of sentences per
        row. Rows whose review is missing get an empty list.

    Raises:
    ValueError: If the dataset has no 'review' column.
    """
    # Load the dataset
    data = pd.read_csv(file_path)

    # Check if 'review' column exists
    if 'review' not in data.columns:
        raise ValueError("The dataset must contain a 'review' column.")

    # read_csv turns empty cells into NaN (a float), which sent_tokenize
    # cannot process; treat missing reviews as empty text instead.
    reviews = data['review'].fillna('')

    # Perform sentence tokenization
    data['sentence_tokenized_review'] = reviews.apply(sent_tokenize)

    return data

# Example usage
file_path = '/content/drive/MyDrive/IMDB Dataset.csv'  # Replace with the actual file path
data = load_and_sentence_tokenize(file_path=file_path)

# Show the raw text next to its sentence split for the first few rows
print(data.loc[:, ['review', 'sentence_tokenized_review']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                           sentence_tokenized_review  
0  [One of the other reviewers has mentioned that...  
1  [A wonderful little production., <br /><br />T...  
2  [I thought this was a wonderful way to spend t...  
3  [Basically there's a family where a little boy...  
4  [Petter Mattei's "Love in the Time of Money" i...  


In [1]:
# Mount Google Drive at /content/drive. The example cells read the dataset
# from '/content/drive/MyDrive/IMDB Dataset.csv', so run this cell first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
