# Text Analysis

In [None]:
!pip install contractions
!pip install syllables

import nltk
# Fetch the NLTK resources used further down in this notebook.
nltk.download('stopwords')  # English stop-word list (stopwords.words('english'))
nltk.download('punkt')      # tokenizer models for word_tokenize / sent_tokenize
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet data, fetched alongside wordnet

In [None]:
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

import re
import string
import contractions
import syllables
import pandas as pd

In [None]:
# Load the content file from the mounted Drive folder, using its
# 'Unnamed: 0' column as the row index, and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/Text Analysis/Content_file.csv', index_col='Unnamed: 0')
df.head()

In [None]:
# Count rows whose 'Text_Content' is missing (True) versus present (False).
df['Text_Content'].isna().value_counts()

In [None]:
# Drop every row that has a missing value in any column (dropna with its
# default arguments), then lower-case the column names so that
# 'Text_Content' becomes 'text_content'.
# NOTE(review): the stated intent is to remove rows whose URL was not valid;
# presumably those are the rows with missing text - confirm that no other
# column can contain NaN, otherwise valid rows are dropped too.

df.dropna(inplace=True)
df.columns = df.columns.str.lower()

## Sentiment Analysis

### Cleaning using Stop Words Lists

In [None]:
# Instantiate the WordNet lemmatizer once; preprocess() uses this shared
# instance to lemmatize every token.

lem = WordNetLemmatizer()

In [None]:
# Build the stop-word list from every regular file in the StopWords folder.
# Each non-blank line contributes its first whitespace-separated token;
# anything after that token on the same line is ignored.
path = '/content/drive/MyDrive/Text Analysis/StopWords/'
stop_words = []

for fl in os.listdir(path):
  file_path = os.path.join(path, fl)
  if not os.path.isfile(file_path):
    # Sub-directories cannot be opened as text files; skip them.
    continue
  with open(file_path, 'r', encoding='cp1252') as f:
    for line in f:
      fields = line.split()
      # Blank or whitespace-only lines have no token and would otherwise
      # raise IndexError on fields[0].
      if fields:
        stop_words.append(fields[0])

len(stop_words)

In [None]:
# Extend the custom list with NLTK's English stop words, appending only the
# entries that are not already present.
already_listed = set(stop_words)
for candidate in stopwords.words('english'):
  if candidate in already_listed:
    continue
  stop_words.append(candidate)
  already_listed.add(candidate)

len(stop_words)

In [None]:
# Normalise every stop word to lower case so it can be compared with the
# lower-cased tokens produced by preprocess().
stop_words = list(map(str.lower, stop_words))
len(stop_words)

In [None]:
# Function to pre-process the text data
def preprocess(s):
  """Clean a raw text string and return its lemmatized content words.

  The text is lower-cased, contractions are expanded, every run of
  non-letter characters is replaced by a single space, the result is
  tokenized, stop words are removed and the remaining tokens are lemmatized.

  Args:
    s: Raw text of one document.

  Returns:
    List of lemmatized tokens in document order (duplicates are kept).
  """
  s = s.lower()
  s = contractions.fix(s)
  s = re.sub('[^a-zA-Z]+', ' ', s).strip()
  tokens = word_tokenize(s)
  # stop_words is a module-level list; testing each token against it is a
  # linear scan. Build a set once per call so each lookup is O(1).
  stop_set = set(stop_words)
  words = [token for token in tokens if token not in stop_set]
  return [lem.lemmatize(word) for word in words]

In [None]:
# Run the pre-processing function over every document and keep the token
# lists both as a standalone list and as a dataframe column.
preprocessed_data = list(map(preprocess, df['text_content']))
df['preprocessed_text'] = preprocessed_data

### Creating a dictionary of Positive and Negative words

In [None]:
# Load the negative and positive word lists; each file is read in full and
# split on whitespace into individual words.
with open('/content/drive/MyDrive/Text Analysis/Master Dictionary/negative-words.txt', 'r', encoding='cp1252') as neg_file:
  neg_text = neg_file.read()
neg_words = neg_text.split()

with open('/content/drive/MyDrive/Text Analysis/Master Dictionary/positive-words.txt', 'r', encoding='cp1252') as pos_file:
  pos_text = pos_file.read()
pos_words = pos_text.split()

len(pos_words), len(neg_words)

In [None]:
# Creating a dictionary of Positive and Negative words
# NOTE: the 'Negtive' key is misspelled, but the scoring loop below looks the
# negative list up under exactly this spelling, so both places must be
# changed together if the key is ever corrected.
pos_neg_words = {'Positive': pos_words,
                 'Negtive': neg_words}

### Extracting Derived variables

#### Positive and Negative Score

In [None]:
# For each document, count how many of its tokens appear in the positive and
# in the negative word list (every occurrence counts, not just distinct words).
# The word lists are converted to sets once so each lookup is O(1) instead of
# a linear scan through the whole dictionary for every token.
positive_lookup = set(pos_neg_words['Positive'])
negative_lookup = set(pos_neg_words['Negtive'])

pos_score = [sum(token in positive_lookup for token in doc) for doc in preprocessed_data]
neg_score = [sum(token in negative_lookup for token in doc) for doc in preprocessed_data]

In [None]:
# Adding Positive and Negative score columns to the dataframe
df['positive'] = pos_score
df['negative'] = neg_score

#### Polarity Score

In [None]:
# Polarity = (positive - negative) / (positive + negative), rounded to
# 2 decimals; the 0.000001 term keeps the denominator non-zero when both
# scores are 0.
df['polarity'] = round((df['positive'] - df['negative'])/(df['positive'] + df['negative'] + 0.000001), 2)

#### Subjectivity Score

In [None]:
# Number of tokens left in each document after pre-processing
# (i.e. with stop words already removed).
df['num_words'] = [len(x) for x in df['preprocessed_text']]

In [None]:
# Subjectivity = (positive + negative) / cleaned word count, rounded to
# 2 decimals; the 0.000001 term guards against a zero word count.
df['subject'] = round((df['positive']+df['negative'])/(df['num_words'] + 0.000001), 2)

In [None]:
df.sample(5)

## Analysis of Readability

#### Average Sentence Length

In [None]:
# Number of sentences per document, from sent_tokenize on the raw text.
df['num_sent'] = [len(sent_tokenize(sen)) for sen in df['text_content']]

In [None]:
# Average sentence length = cleaned word count / sentence count, rounded to
# 2 decimals.
# NOTE(review): num_words excludes stop words while num_sent is counted on
# the raw text - confirm this is the intended definition.
df['avg_sen_len'] = round(df['num_words']/df['num_sent'], 2)
df.sample(5)

#### Percentage of Complex Words

In [None]:
def complex_words(sen):
  """Return how many distinct complex words occur in a token sequence.

  A token is treated as complex when syllables.estimate() reports more than
  two syllables for it and it does not end in 'ed' or 'es'. Matches are
  collected in a set, so a word that occurs several times is counted once.

  Args:
    sen: Iterable of word tokens.

  Returns:
    Number of distinct complex tokens.
  """
  distinct_complex = {
    w for w in sen
    if syllables.estimate(w) > 2 and not w.endswith(('ed', 'es'))
  }
  return len(distinct_complex)


In [None]:
# Distinct complex-word count for each document's cleaned token list.
df['complex_words'] = [complex_words(w) for w in df['preprocessed_text']]

In [None]:
# Share of complex words = complex word count / cleaned word count, rounded
# to 2 decimals. Despite the column name the value is stored as a fraction;
# it is not multiplied by 100.
df['%age_complex_words'] = round(df['complex_words']/df['num_words'], 2)

#### Fog Index

In [None]:
# Fog index = 0.4 * (average sentence length + share of complex words),
# rounded to 2 decimals.
# NOTE(review): the standard Gunning fog formula uses the percentage (0-100)
# of complex words, whereas '%age_complex_words' holds a fraction - confirm
# which definition the output is expected to follow.
df['fog_index'] = round( 0.4 * (df['avg_sen_len'] + df['%age_complex_words']), 2)

In [None]:
df.sample(3)

## Average Number of Words Per Sentence

In [None]:
# Same ratio as 'avg_sen_len' (cleaned word count / sentence count); kept as
# its own column because the output lists both metrics.
df['avg_words_per_sen'] = round(df['num_words']/df['num_sent'], 2)

In [None]:
df.sample(3)

## Complex Word Count and Word Count

These are already calculated above: `complex_words` holds the complex word count and `num_words` holds the word count.

In [None]:
df.sample(5)

## Syllable Count Per Word

In [None]:
# Total syllables per document. syllables.estimate() is a per-word heuristic,
# so it is applied to each token and the estimates are summed, rather than
# being run once over the whole space-joined document.
df['syl_count'] = [sum(syllables.estimate(word) for word in tokens) for tokens in df['preprocessed_text']]

In [None]:
# Average syllables per cleaned word, rounded to 2 decimals.
df['syl_per_word'] = round(df['syl_count']/df['num_words'], 2)

In [None]:
df.sample(3)

## Personal Pronouns

In [None]:
def personal_pronouns(text):
  """Count personal-pronoun occurrences in a text string.

  The whole words I, we, my and ours are matched case-insensitively. The
  word "us" is matched in lower case only, because the inline (?-i:...)
  group switches case-insensitivity off for it; upper-case "US" is therefore
  not counted.

  Args:
    text: Text to scan.

  Returns:
    Number of matches found.
  """
  matches = re.findall(r'\b(I|we|my|ours|(?-i:us))\b', text, flags=re.I)
  return len(matches)

In [None]:
# Personal-pronoun count per document, computed on the raw (uncleaned) text
# so that letter case is still available to the pattern.
df['personal_pronouns'] = [personal_pronouns(sen) for sen in df['text_content']]

In [None]:
df.sample(3)

## Average Word Length

In [None]:
def text_len(text):
  """Return the average word length of a text, ignoring punctuation.

  Every character in string.punctuation is removed, the remainder is split
  on whitespace, and the mean number of characters per word is returned.

  Args:
    text: Text to measure.

  Returns:
    Average characters per word as a float. Returns 0.0 when the text
    contains no words (empty, whitespace-only or punctuation-only input)
    instead of raising ZeroDivisionError.
  """
  stripped = text.translate(str.maketrans('', '', string.punctuation))
  words = stripped.split()
  if not words:
    return 0.0
  return sum(map(len, words)) / len(words)

In [None]:
# Average word length per document, computed on the raw text with
# punctuation removed.
df['avg_word_len'] = [text_len(text) for text in df['text_content']]

In [None]:
df.sample(3)

## Creating Output Structure

In [None]:
# Keep only the columns that go into the output file, in output order.
output_columns = [
  'url_id',
  'url',
  'positive',
  'negative',
  'polarity',
  'subject',
  'avg_sen_len',
  '%age_complex_words',
  'fog_index',
  'avg_words_per_sen',
  'complex_words',
  'num_words',
  'syl_per_word',
  'personal_pronouns',
  'avg_word_len',
]
df = df[output_columns]

In [None]:
# Rename the selected columns, by position, to the headers written to the
# output file. 'PERCENTAGE OF COMPLEX WORDS' was previously misspelled
# 'PERCENTGE', which ended up in the CSV header.
df.columns = [
  'URL_ID',
  'URL',
  'POSITIVE SCORE',
  'NEGATIVE SCORE',
  'POLARITY SCORE',
  'SUBJECTIVITY SCORE',
  'AVG SENTENCE LENGTH',
  'PERCENTAGE OF COMPLEX WORDS',
  'FOG INDEX',
  'AVG NUMBER OF WORDS PER SENTENCE',
  'COMPLEX WORD COUNT',
  'WORD COUNT',
  'SYLLABLE PER WORD',
  'PERSONAL PRONOUNS',
  'AVG WORD LENGTH',
]

In [None]:
df.head()

In [None]:
# Write the final table to the Drive folder; index=False leaves the row
# index out of the file.
df.to_csv('/content/drive/MyDrive/Text Analysis/Output Data.csv', index=False)