# Week 1 Lab - Sujan Maharjan

## Text Preprocessing

### Lowercasing

In [22]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer


# Load the tweet data from the CSV file into a DataFrame
CSV_PATH = 'realdonaldtrump.csv'
df = pd.read_csv(CSV_PATH)


In [23]:
# Preview the first ten entries of the 'content' column
df['content'].iloc[:10]

0    Be sure to tune in and watch Donald Trump on L...
1    Donald Trump will be appearing on The View tom...
2    Donald Trump reads Top Ten Financial Tips on L...
3    New Blog Post: Celebrity Apprentice Finale and...
4    "My persona will never be that of a wallflower...
5    Miss USA Tara Conner will not be fired - "I've...
6    Listen to an interview with Donald Trump discu...
7    "Strive for wholeness and keep your sense of w...
8    Enter the "Think Like A Champion" signed book ...
9    "When the achiever achieves, it's not a platea...
Name: content, dtype: object

In [24]:
# Replace 'content' with its lowercase form, then preview the first ten rows
content_lower = df['content'].str.lower()
df['content'] = content_lower
content_lower.head(10)

0    be sure to tune in and watch donald trump on l...
1    donald trump will be appearing on the view tom...
2    donald trump reads top ten financial tips on l...
3    new blog post: celebrity apprentice finale and...
4    "my persona will never be that of a wallflower...
5    miss usa tara conner will not be fired - "i've...
6    listen to an interview with donald trump discu...
7    "strive for wholeness and keep your sense of w...
8    enter the "think like a champion" signed book ...
9    "when the achiever achieves, it's not a platea...
Name: content, dtype: object

### Punctuation removal

In [25]:
import string

# Translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed."""
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every entry of the 'content' column, then preview it
content_no_punct = df['content'].apply(remove_punctuation)
df['content'] = content_no_punct
content_no_punct.head(10)

0    be sure to tune in and watch donald trump on l...
1    donald trump will be appearing on the view tom...
2    donald trump reads top ten financial tips on l...
3    new blog post celebrity apprentice finale and ...
4    my persona will never be that of a wallflower ...
5    miss usa tara conner will not be fired  ive al...
6    listen to an interview with donald trump discu...
7    strive for wholeness and keep your sense of wo...
8    enter the think like a champion signed book an...
9    when the achiever achieves its not a plateau i...
Name: content, dtype: object

### Identifying and Removing Stop Words

In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus used below
nltk.download('stopwords')

# Build the stop-word set once so each membership test is a hash lookup.
# stopwords.words() expects the name of a language list in the corpus; the
# previous argument ' donald trump' is not one, so the call failed.
# NOTE(review): if the intent was to also drop the words "donald" and "trump",
# add them explicitly, e.g. stop_words |= {'donald', 'trump'}.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with every English stop word removed.

    The text is split on whitespace, words whose lowercase form is in
    ``stop_words`` are dropped, and the remaining words are re-joined with
    single spaces.
    """
    return " ".join(word for word in text.split() if word.lower() not in stop_words)


# Apply the filter to the 'content' column and preview the result
df['content'] = df['content'].apply(remove_stopwords)

df['content'].head(10)

### Tokenizing Text into Words

In [None]:
from nltk.tokenize import word_tokenize

# Tokenizer data required by word_tokenize. The resource name depends on the
# installed NLTK version, so both known names are requested.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Store the tokens in their own column. Overwriting 'content' with lists
# breaks the later cells, which call .split() / .lower() on 'content' and
# therefore need it to remain a string.
df['tokens'] = df['content'].apply(word_tokenize)

df['tokens'].head(10)

### NLTK Word Stemming

In [None]:
# Create a stemmer object
stemmer = PorterStemmer()


def stem_text(text):
    """Stem every word of ``text`` with the Porter stemmer.

    Parameters
    ----------
    text : str or list of str
        A whitespace-separated string, or an already tokenized list of words.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Only strings need splitting; a token list is used as-is, so the function
    # works whether or not 'content' has been tokenized beforehand.
    words = text.split() if isinstance(text, str) else text
    return ' '.join(stemmer.stem(word) for word in words)


# Apply stemming to the 'content' column
df['Stemmed Text'] = df['content'].apply(stem_text)

df['Stemmed Text'].head(10)

### WordNetLemmatizer

In [None]:

# Data needed by WordNetLemmatizer and nltk.pos_tag. The tagger resource name
# differs between NLTK versions, so both known names are requested.
for _resource in ('wordnet', 'omw-1.4',
                  'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(_resource, quiet=True)

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# First letter of a Penn Treebank tag -> WordNet part-of-speech constant
_TREEBANK_TO_WORDNET = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}


def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet constant.

    Returns ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
    ``wordnet.ADV`` for tags starting with J, V, N or R respectively, and
    ``None`` for every other tag.
    """
    return _TREEBANK_TO_WORDNET.get(treebank_tag[:1])


def lemmatize_word(word):
    """Lemmatize a single word using the part of speech NLTK assigns to it.

    The word is tagged on its own (without sentence context); when the tag has
    no WordNet equivalent, the lemmatizer's default part of speech is used.
    """
    pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
    if pos:
        return lemmatizer.lemmatize(word, pos)
    return lemmatizer.lemmatize(word)


def lemmatize_text(text):
    """Lemmatize every word of ``text`` and join the results with spaces.

    ``text`` may be a whitespace-separated string or an already tokenized
    list of words; only strings are split.
    """
    words = text.split() if isinstance(text, str) else text
    return ' '.join(lemmatize_word(word) for word in words)


# Lemmatize each word in the tweets
df['lemmatized_text'] = df['content'].apply(lemmatize_text)

df['lemmatized_text'].head(10)

### Difference Between Stemming and Lemmatization

In [None]:
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# lemmatize_word() and get_wordnet_pos() are already defined in the
# WordNetLemmatizer section and are reused here instead of being redefined.


def stem_word(word):
    """Return the Porter stem of a single word."""
    return stemmer.stem(word)


def _as_words(text):
    """Return the words of ``text``: split a string on whitespace, pass a token list through."""
    return text.split() if isinstance(text, str) else text


# Stem each word in the tweets
df['stemmed_text'] = df['content'].apply(
    lambda x: ' '.join(stem_word(word) for word in _as_words(x))
)

# Lemmatize each word in the tweets
df['lemmatized_text'] = df['content'].apply(
    lambda x: ' '.join(lemmatize_word(word) for word in _as_words(x))
)

# Compare stemmed and lemmatized text with the text they were derived from.
# The tweet text lives in 'content'; no cell in this notebook reads or creates
# a 'text' column, so indexing df['text'] is presumed to be a typo.
# Positional access (.iloc) and the length guard keep the loop valid for
# frames with fewer than ten rows or a non-default index.
for i in range(min(10, len(df))):
    print(f"Original text: {df['content'].iloc[i]}")
    print(f"Stemmed text: {df['stemmed_text'].iloc[i]}")
    print(f"Lemmatized text: {df['lemmatized_text'].iloc[i]}\n")


### Plotting Word Frequencies

In [None]:
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

# Collect every word of every tweet. 'content' may hold plain strings or
# already tokenized lists, so only strings are passed through word_tokenize.
words = []
for tweet in df['content']:
    if isinstance(tweet, str):
        words.extend(word_tokenize(tweet.lower()))
    else:
        words.extend(token.lower() for token in tweet)

# Get the frequency distribution of words
freq_dist = nltk.FreqDist(words)

# Plot the top 20 words on an explicit Axes. FreqDist.plot draws on the
# current Axes; show=False stops it from calling plt.show() itself (the
# default in some NLTK versions), which would render the figure before the
# title and labels below are set and leave them on a new, empty figure.
fig, ax = plt.subplots(figsize=(10, 5))
freq_dist.plot(20, show=False)

# Set the plot title and axis labels
ax.set_title('Frequency distribution of top 20 words')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')

# Display the plot
plt.show()
