In [267]:
# Importing libraries
import os
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import requests

pd.set_option('display.max_columns', 100)

In [268]:
# Loading the dataset
inp_data = pd.read_excel('./Sample Files/Input.xlsx')
inp_data.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [269]:
def extract_data_from_url(url):
    # Sending an HTTP GET request to the specified URL and retrieving the response
    response = requests.get(url)

    if response.status_code == 404:
        print(f"404 Error: Page not found for URL: {url}")
        return None, None

    # Created a BeautifulSoup object from the HTML content in the response using the 'html5lib' parser
    soup = BeautifulSoup(response.text, 'html5lib')

    # Extracting article title
    title = soup.find('h1', class_=['entry-title', 'tdb-title-text']).text

    text_body = ""  # initialized an empty string to store the article content

    # Selected all the possible classes which contain the article content
    possible_classes = ['td-post-content tagdiv-type', 'tdb-block-inner', 'td-fix-index']

    # Selecting all the div containers with required class
    content_div = soup.find_all('div', class_=possible_classes)

    # Selecting only those div containers which contain '<p>' tag-elements
    for content in content_div:
        if content.find_all('p'):
            paragraphs = content.find_all('p')

    # Extracting the article contents from each paragraph and concatenating them
    for para in paragraphs:
        text_body = text_body + para.text
    return title, text_body

In [270]:
content_title = {} # Initializing a dictionary for content title
content = {} # Initializing a dictionary for article content

In [271]:
# Store the extracted content title and content in corresponding dictionaries
for url_id, url in zip(inp_data['URL_ID'], inp_data['URL']):
    content_title[url_id], content[url_id] = extract_data_from_url(url)

404 Error: Page not found for URL: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
404 Error: Page not found for URL: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


In [274]:
# Removing '?' and '.' from the filename to avoid any errors
for url_id, title in content_title.items():
    if title:
        if title[-1] == '.':
            content_title[url_id] = title.replace(' ', '_').replace(':', '_').replace('?', '').replace('/', '')[:-1]
        else:
            content_title[url_id] = title.replace('?', '').replace(' ', '_').replace('/', '').replace(':', '_')

In [275]:
# Displaying the all the article titles
display(content_title)

{123.0: 'Rise_of_telemedicine_and_its_Impact_on_Livelihood_by_2040',
 321.0: 'Rise_of_e-health_and_its_impact_on_humans_by_the_year_2030',
 2345.0: 'Rise_of_e-health_and_its_impact_on_humans_by_the_year_2030',
 4321.0: 'Rise_of_telemedicine_and_its_Impact_on_Livelihood_by_2040',
 432.0: 'Rise_of_telemedicine_and_its_Impact_on_Livelihood_by_2040',
 2893.8: 'Rise_of_Chatbots_and_its_impact_on_customer_support_by_the_year_2040',
 3355.6: 'Rise_of_e-health_and_its_impact_on_humans_by_the_year_2030',
 3817.4: 'How_does_marketing_influence_businesses_and_consumers',
 4279.2: 'How_advertisement_increase_your_market_value',
 4741.0: 'Negative_effects_of_marketing_on_society',
 5202.8: 'How_advertisementmarketing_affects_business',
 5664.6: 'Rising_IT_cities_will_impact_the_economy,_environment,_infrastructure,_and_city_life_by_the_year_2035',
 6126.4: 'Rise_of_OTT_platform_and_its_impact_on_entertainment_industry_by_the_year_2030',
 6588.2: 'Rise_of_Electric_Vehicles_and_its_Impact_on_Liveliho

In [276]:
# Displaying the all the article contents
display(content)

{123.0: 'Telemedicine, the use of technology to diagnose and treat patients remotely, has been rising in recent years. With the advent of high-speed internet and improved video conferencing tools, healthcare providers are increasingly turning to telemedicine to provide care to patients in remote or underserved areas.Telemedicine, using technology to provide healthcare services remotely, has recently gained popularity. With advancements in communication and medical technology, it has become increasingly possible for doctors and patients to connect and interact from anywhere in the world. This has led to the rise of telemedicine, which has the potential to revolutionize the way healthcare is delivered.#TelemedicineThe increasing focus on preventative healthcare has also driven the rise of telemedicine. As more and more people become aware of the importance of staying healthy, they are looking for ways to prevent illness and maintain their health. Telemedicine allows people to monitor the

In [277]:
inp_data['NAME'] = content_title.values()

In [279]:
# Displaying the total number of web scraped files before and after updating
# by removing entries with missing or empty content titles.
print("Total web scraped files: {}".format(len(content)))
content_title, content = remove_none(content_title, content)
print("Total web scraped files after updating: {}".format(len(content)))

Total web scraped files: 114
Total web scraped files after updating: 112


In [293]:
# Function to save contents to .txt files
def save_file():
    path = "./ArticleContents/"
    for url_id, data in content.items():
        if data:
            try:
                save_to_txt(data, content_title[url_id], path)
            except Exception as e:
                print(f"Error saving id: {url_id} {content_title[url_id]}.txt - {e}")
                return False
    return True

save_file()

File saved successfully: ./ArticleContents/Rise_of_telemedicine_and_its_Impact_on_Livelihood_by_2040.txt
File saved successfully: ./ArticleContents/Rise_of_e-health_and_its_impact_on_humans_by_the_year_2030.txt
File saved successfully: ./ArticleContents/Rise_of_e-health_and_its_impact_on_humans_by_the_year_2030.txt
File saved successfully: ./ArticleContents/Rise_of_telemedicine_and_its_Impact_on_Livelihood_by_2040.txt
File saved successfully: ./ArticleContents/Rise_of_telemedicine_and_its_Impact_on_Livelihood_by_2040.txt
File saved successfully: ./ArticleContents/Rise_of_Chatbots_and_its_impact_on_customer_support_by_the_year_2040.txt
File saved successfully: ./ArticleContents/Rise_of_e-health_and_its_impact_on_humans_by_the_year_2030.txt
File saved successfully: ./ArticleContents/How_does_marketing_influence_businesses_and_consumers.txt
File saved successfully: ./ArticleContents/How_advertisement_increase_your_market_value.txt
File saved successfully: ./ArticleContents/Negative_effect

True

In [281]:
# Stored all the score metrics 
score_metrics = ["POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE","SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS","FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT","WORD COUNT",
                "SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"]

# Created a dataframe only with the metrics
metrics_df = pd.DataFrame(columns = score_metrics)

In [282]:
# Combine the input data DataFrame 'inp_data' and the metrics DataFrame 'metrics_df'
# by concatenating them horizontally along axis 1 to create a comprehensive 'text_analysis_df'
text_analysis_df = pd.concat([inp_data,metrics_df],axis = 1)

In [283]:
# Replaced the 'NaN' values with 0
text_analysis_df.fillna(0,inplace = True)

In [284]:
text_analysis_df.head()

Unnamed: 0,URL_ID,URL,NAME,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,Rise_of_telemedicine_and_its_Impact_on_Livelih...,0,0,0,0,0,0,0,0,0,0,0,0,0
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise_of_e-health_and_its_impact_on_humans_by_t...,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise_of_e-health_and_its_impact_on_humans_by_t...,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,Rise_of_telemedicine_and_its_Impact_on_Livelih...,0,0,0,0,0,0,0,0,0,0,0,0,0
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,Rise_of_telemedicine_and_its_Impact_on_Livelih...,0,0,0,0,0,0,0,0,0,0,0,0,0


In [294]:
# Load the 'StopWords' text file
def get_stop_words():
    stop_words_files = ['StopWords_Auditor', 'StopWords_Currencies', 'StopWords_DatesandNumbers', 'StopWords_Generic',
                        'StopWords_GenericLong', 'StopWords_Geographic', 'StopWords_Names']
    stop_words = []
    for files in stop_words_files:
        with open(f'.\\Sample Files\\StopWords\\{files}.txt', 'r') as file:
            context = file.readlines()
            for line in context:
                for word in line.split():
                    if word != '|':
                        stop_words.append(word)
    return stop_words

In [295]:
# Function to extract all positive words from the .txt file
def get_positive_words():
    try:
        filepath = '.\\Sample Files\\MasterDictionary\\positive-words.txt'
        positive_words = []
        with open(filepath, 'r') as file:
            word_lines = file.readlines()
            for word in word_lines:
                positive_words.append(word[:-2])
        return positive_words
    except Exception as e:
        return e

In [296]:
# Function to extract all negative words from the .txt file
def get_negative_words():
    try:
        filepath = '.\\Sample Files\\MasterDictionary\\negative-words.txt'
        negative_words = []
        with open(filepath, 'r') as file:
            word_lines = file.readlines()
            for word in word_lines:
                negative_words.append(word[:-2])
        return negative_words
    except Exception as e:
        return e

In [297]:
# Function to get the sentiment dictionary
def get_sentiment_dictionary(stop_words):
    try:
        dictionary = {'positive_words': [], 'negative_words': []}
        # Adding positive words first
        positive_words = get_positive_words()
        dictionary['positive_words'] = [word for word in positive_words if word not in stop_words]
        # Adding negative words next
        negative_words = get_negative_words()
        dictionary['negative_words'] = [word for word in negative_words if word not in stop_words]
    except Exception as e:
        return e
    return dictionary

In [298]:
# Get stop words
stop_words = get_stop_words()

# Create the sentiment dictionary
sentiment_dictionary = get_sentiment_dictionary(stop_words)

In [299]:
# Function to clean text files
def clean_files(text, stop_words):
    tokenized_words = word_tokenize(text)
    new_content = ' '.join([word for word in tokenized_words if word not in stop_words])
    return new_content

In [300]:
# Path to article content files
path = '.\\ArticleContents\\'
files = os.listdir(path)
for txt in files:
    filepath = os.path.join(path, txt)
    with open(filepath, 'r', encoding='utf-8') as file:
        txt_body = file.read().strip()
        new_content = clean_files(txt_body, stop_words)
        save_to_txt(new_content, txt[:-4], '.\\Updated_article_content\\')

File saved successfully: .\Updated_article_content\AI_in_healthcare_to_Improve_Patient_Outcomes.txt
File saved successfully: .\Updated_article_content\All_you_need_to_know_about_online_marketing.txt
File saved successfully: .\Updated_article_content\An_outlook_of_healthcare_by_the_year_2040,_and_how_it_will_impact_human_lives.txt
File saved successfully: .\Updated_article_content\Are_we_any_closer_to_preventing_a_nuclear_holocaust.txt
File saved successfully: .\Updated_article_content\Can_robots_tackle_late-life_loneliness.txt
File saved successfully: .\Updated_article_content\Can_You_Be_Great_Leader_Without_Technical_Expertise.txt
File saved successfully: .\Updated_article_content\Changing_landscape_and_emerging_trends_in_the_Indian_ITITeS_Industry.txt
File saved successfully: .\Updated_article_content\Continued_Demand_for_Sustainability.txt
File saved successfully: .\Updated_article_content\Contribution_of_handicrafts_(Visual_Arts_&_Literature)_in_the_Indian_economy.txt
File saved su

File saved successfully: .\Updated_article_content\Should_celebrities_be_allowed_to_join_politics.txt
File saved successfully: .\Updated_article_content\Travel_and_Tourism_Outlook.txt
File saved successfully: .\Updated_article_content\What_are_the_key_policies_that_will_mitigate_the_impacts_of_COVID-19_on_the_world_of_work.txt
File saved successfully: .\Updated_article_content\What_if_the_Creation_is_Taking_Over_the_Creator.txt
File saved successfully: .\Updated_article_content\What_is_the_chance_Homo_sapiens_will_survive_for_the_next_500_years.txt
File saved successfully: .\Updated_article_content\What_is_the_difference_between_Artificial_Intelligence,_Machine_Learning,_Statistics,_and_Data_Mining.txt
File saved successfully: .\Updated_article_content\What_is_the_future_of_mobile_apps.txt
File saved successfully: .\Updated_article_content\What_is_the_repercussion_of_the_environment_due_to_the_COVID-19_pandemic_situation.txt
File saved successfully: .\Updated_article_content\What_Jobs_

In [334]:
# Function to get the ID based on the name
def get_id(df, name):
    return df.loc[df['NAME'] == name, 'URL_ID'].tolist()

# Function to filter symbols
def filter_symbols(tokens):
    new_token = []
    stop_words = set(stopwords.words('english'))
    for word in tokens:
        if word.isalpha():
            if word not in stop_words:
                new_token.append(word)
    return new_token

# Functio to count number of syllables in each word
def count_syllables(word):
    vowels = 'aeiouAEIOU'
    syllable_count = 0
    for letter in word:
        if letter in vowels:
            syllable_count+=1
    return syllable_count

# Function to count characters in tokens
def char_length(tokens):
    count_char = 0
    for word in tokens:
        count_char += len(word)
    return count_char

# Function to count the number of complex words
def num_complex_words(text, syllable_threshold=2):
    words = text.split()
    complex_words = [word for word in words if count_syllables(word) > syllable_threshold]
    word_length = len(complex_words)
    return word_length

# Function to get tokenized word list
def get_tokenize_list(content):
    tokenized_words = word_tokenize(content)
    return tokenized_words

# Function to update the positive score
def update_positive_score(df, url_id, tkn_list, sen_dict):
    for word in tkn_list:
        if word in sen_dict['positive_words']:
            df.loc[df['URL_ID'] == url_id, 'POSITIVE SCORE'] += 1

# Function to update the negative score
def update_negative_score(df, url_id, tkn_list, sen_dict):
    for word in tkn_list:
        if word in sen_dict['negative_words']:
            df.loc[df['URL_ID'] == url_id, 'NEGATIVE SCORE'] += 1

# Function to update the polarity score
def update_polarity_score(df, url_id):
    try:
        Positive_score = df.loc[df['URL_ID'] == url_id, 'POSITIVE SCORE']
        Negative_score = df.loc[df['URL_ID'] == url_id, 'NEGATIVE SCORE']
        polarity_score = (Positive_score - Negative_score) / (Positive_score + Negative_score + 0.000001)
    except ZeroDivisionError:
        polarity_score = 0  # Handle division by zero
    df.loc[df['URL_ID'] == url_id, 'POLARITY SCORE'] = polarity_score
    return polarity_score

# Function to update the subjectivity score
def update_subjectivity_score(df, url_id, word_length):
    try:
        Positive_score = df.loc[df['URL_ID'] == url_id, 'POSITIVE SCORE']
        Negative_score = df.loc[df['URL_ID'] == url_id, 'NEGATIVE SCORE']
        Total_Words_after_cleaning = word_length
        subjectivity_score = (Positive_score + Negative_score) / (Total_Words_after_cleaning + 0.000001)
    except ZeroDivisionError:
        subjectivity_score = 0  # Handle division by zero
    df.loc[df['URL_ID'] == url_id, 'SUBJECTIVITY SCORE'] = subjectivity_score
    return subjectivity_score
    
# Function to update the average sentence per length    
def update_avg_sentence_length(df, url_id, word_length, line_num):
    try:
        average_sentence_length = word_length / line_num
    except ZeroDivisionError:
        average_sentence_length = 0  # Handle division by zero
    df.loc[df['URL_ID'] == url_id, 'AVG SENTENCE LENGTH'] = average_sentence_length
    return average_sentence_length

# Function to update the percentage of complex words column
def update_complex_words_perc(df, url_id, word_length, line_num, num_complex):
    try:
        percentage_of_complex_words = num_complex / word_length
    except ZeroDivisionError:
        percentage_of_complex_words = 0  # Handle division by zero
    df.loc[df['URL_ID'] == url_id, 'PERCENTAGE OF COMPLEX WORDS'] = percentage_of_complex_words
    return percentage_of_complex_words

# Function to update the FOG index column
def update_index_fog(df,url_id,word_length,line_num,num_complex):
    try:
        avg_sentence_length = word_length / line_num
        complex_perc = num_complex / word_length
        fog_index = 0.4 * (avg_sentence_length + complex_perc)
    except ZeroDivisionError:
        fog_index = 0  # Handle division by zero
    df.loc[df['URL_ID'] == url_id,'FOG INDEX'] = fog_index
    return fog_index

# Function to update the average number of words per sentence column
def update_avg_words_per_sentence(df,url_id,word_length,line_num,num_complex):
    average_words_per_sentence = word_length / line_num
    df.loc[df['URL_ID'] == url_id,'AVG NUMBER OF WORDS PER SENTENCE'] = average_words_per_sentence
    return average_words_per_sentence

# Function to update the count of complex words column
def update_complex_word_count(df,url_id,complex_num):
    df.loc[df['URL_ID'] == url_id,'COMPLEX WORD COUNT'] = complex_num
    return complex_num

# Function to update the word count column
def update_word_count(df,url_id,tkn_list):
    filtered_list = filter_symbols(tkn_list)
    word_count = len(filtered_list)
    df.loc[df['URL_ID'] == url_id,'WORD COUNT'] = word_count
    return word_count

# Function to update the syllable count per word column
def update_syllable_per_word(df,url_id,tkn_list):
    vowels = 'aeiouAEIOU'
    syllable_count = 0
    for word in tkn_list:
        if not word.endswith(("es", "ed")):
            for letter in word:
                if letter in vowels:
                    syllable_count+=1
    df.loc[df['URL_ID'] == url_id,'SYLLABLE PER WORD'] = syllable_count
    return syllable_count

# Function to update the count of personal pronouns (I, we, my, ours, us) 
def update_personal_pronouns(df,url_id,text):
    pronoun_pattern = r'\b(I|we|my|ours|us)\b'
    pronoun_matches = re.findall(pronoun_pattern, text)
    filtered_pronouns = [pronoun for pronoun in pronoun_matches if pronoun != "US"]
    pronoun_count = len(filtered_pronouns)
    df.loc[df['URL_ID'] == url_id,'PERSONAL PRONOUNS'] = pronoun_count
    return pronoun_count

# Function to update the average word length column
def update_avg_words_length(df, url_id, word_length, char_length):
    try:
        avg_word_length = char_length / word_length
    except ZeroDivisionError:
        avg_word_length = 0  # Handle division by zero
    df.loc[df['URL_ID'] == url_id, 'AVG WORD LENGTH'] = avg_word_length
    return avg_word_length

In [336]:
def update_metrics(df, dictionary):
    # Define the path to the directory containing updated article content files
    path = '.\\Updated_article_content\\'
    
    # List all the files in the directory
    files = os.listdir(path)
    
    # Iterate through each file
    for txt in files:
        # Get the URL_ID for the article based on the filename
        url_id = get_id(df, txt[:-4])

        # Create the full file path
        filepath = os.path.join(path, txt)
        
        # Read the lines and the full text content from the file
        with open(filepath,'r',encoding='utf-8') as file:
            txt_lines = file.readlines()
            txt_body = [''.join(text) for text in txt_lines][0]
            
        # Tokenize the text content
        tnk_list = get_tokenize_list(txt_body)

        # Calculate various metrics based on the text content
        word_length = len(tnk_list)
        chr_length = char_length(tnk_list)
        num_lines = len(txt_lines) 
        num_complex = num_complex_words(txt_body)

        # Update the DataFrame with the calculated metrics
        for id in url_id:
            update_positive_score(df, id, tnk_list, dictionary)
            update_negative_score(df, id, tnk_list, dictionary)
            update_polarity_score(df, id)
            update_subjectivity_score(df, id, word_length)
            update_avg_sentence_length(df, id, word_length, num_lines)
            update_complex_words_perc(df, id, word_length, num_lines, num_complex)
            update_index_fog(df, id, word_length, num_lines, num_complex)
            update_avg_words_per_sentence(df, id, word_length, num_lines, num_complex)
            update_complex_word_count(df, id, num_complex)
            update_word_count(df, id, tnk_list)
            update_syllable_per_word(df, id, tnk_list)
            update_personal_pronouns(df, id,txt_body)
            update_avg_words_length(df, id, word_length, chr_length)
    return df
text_analysis_df = update_metrics(text_analysis_df,sentiment_dictionary)

In [332]:
path = '.\\Updated_article_content\\'

# List all the files in the directory
files = os.listdir(path)

# Iterate through each file
for txt in files:
    # Get the URL_ID for the article based on the filename
    url_id = get_id(text_analysis_df, txt[:-4])

    # Create the full file path
    filepath = os.path.join(path, txt)

    # Read the lines and the full text content from the file
    with open(filepath,'r',encoding='utf-8') as file:
        txt_lines = file.readlines()
        txt_body = [''.join(text) for text in txt_lines][0]

    # Tokenize the text content
    tnk_list = get_tokenize_list(txt_body)

    # Calculate various metrics based on the text content
    word_length = len(tnk_list)
    chr_length = char_length(tnk_list)
    num_lines = len(txt_lines) 
    num_complex = num_complex_words(txt_body)
    print(txt_body)

Introduction “ If kills 10 million people decades , highly infectious virus war . Not missiles microbes. ” Bill Gates ’ remarks conference 2014 , world avoided Ebola outbreak . When , unprecedented , invisible virus hit , met overwhelmed unprepared healthcare system oblivious population . This public health emergency demonstrated lack scientific consideration underlined alarming robust innovations health medical facilities . For past years , artificial intelligence proven tangible potential healthcare sectors , clinical practices , translational medical biomedical research.After case detected December 31st 2019 , program developed BlueDot alerted world pandemic . It quick realise ’ ability analyse large chunks data detecting patterns identifying tracking carriers virus.Many tracing apps tabs people infected prevent risk cross-infection algorithms track patterns extract features classify categorise them.So ? IBM Watson , sophisticated works cloud computing natural language processing , 

What ’ perfection ? Does person expect perfection oneself ? Or amounts ? Many times question mind deem perfect . But verify ? Let understand work individuals put task carpentry budget . All Rs . 5000 ( Assumed ) complete task . They ’ budget.Manish , Vinay , Sameer make stool . Manish understanding carpentry ; learns YouTube . Vinay carpentry lessons engineering , practical idea make stool . But Sameer Carpenter , total understanding material , process , final product.Let start Manish . Understand task jobs . Manish options . He order parts assemble order material required wood , glue , nails , work assembly final product . The difference lack expertise field.Vinay options practical experience carpentry classes workshop Engineering . Sameer advantage skill experience perfected time.All days task . Since Manish finds easy assemble stool parts procured carpenter ; talks carpenter orders 3 leg parts , 6 horizontal support bars , circular base sit top . Vinay thinks shape parts experience 

In [339]:
# Dropped column 'Name' from the DataFrame
text_analysis_df.drop(columns = ['NAME'],axis = 1,inplace = True)

In [344]:
text_analysis_df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,48,18,0.454545,0.083333,792,0.482323,316.992929,792,382,621,1610,0,6.435606
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,42,10,0.615385,0.072524,717,0.499303,286.999721,717,358,518,1467,0,6.3947
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,42,10,0.615385,0.072524,717,0.499303,286.999721,717,358,518,1467,0,6.3947
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,48,18,0.454545,0.083333,792,0.482323,316.992929,792,382,621,1610,0,6.435606
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,48,18,0.454545,0.083333,792,0.482323,316.992929,792,382,621,1610,0,6.435606


In [341]:
def save_to_csv(df, filename, path=''):
    try:
        # Ensure that the directory specified in 'path' exists or create it
        os.makedirs(path, exist_ok=True)
        
        # Create the full file path by joining the directory and filename with the .csv extension
        file_path = os.path.join(path, f'{filename}.csv')
        
        # Save the DataFrame to the CSV file, excluding the index column
        df.to_csv(file_path, index=False)
        
        # Return True to indicate a successful save operation
        return True
    except Exception as e:
        # Return an Exception object in case of an error during the saving process
        return e

In [342]:
def save_to_txt(content, filename, path=''):
    try:
        # Create the directory (if it doesn't exist) where the .txt file will be saved
        os.makedirs(path, exist_ok=True)
        
        # Create the full file path with the specified filename
        file_path = os.path.join(path, f'{filename}.txt')
        
        # Write the content to the .txt file with UTF-8 encoding
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
        
        # Check if the file exists after saving and print a success message
        if os.path.exists(file_path):
            print(f"File saved successfully: {file_path}")
            return True
        else:
            print(f"Error: File not found after saving: {file_path}")
            return False
    except Exception as e:
        print(f"Error {e} in file : {filename}")
        return False

In [343]:
save_to_csv(text_analysis_df,'Output Data Strucure','Output')

True