In [2]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from dataset_prep import create_total_dataframe
from data_preprocessing import preprocess_text

# Display configuration: no limit on rendered rows or columns, and up to
# 1000 characters shown per cell.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/darinaponomarova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darinaponomarova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Source documents handed to create_total_dataframe.
all_docx = [
    "data/2022_satellite.docx",
    "data/2021_satellite.docx",
]
# Alternative single-file input, kept disabled:
# all_docx = ["data/2020_satellite.docx"]

total_df = create_total_dataframe(all_docx)

  df['Authors'], df['Article_Name'] = df['Article_Name'].str.rsplit('.', 1).str


In [4]:
# Run preprocess_text over every 'Text' value and keep the result next to the
# original in a 'Processed_Text' column.
processed_text = total_df['Text'].apply(preprocess_text)
total_df['Processed_Text'] = processed_text

In [5]:
# Assign 1-based article identifiers taken from a freshly reset index.
# The guard makes this cell safe to re-run: a repeated in-place reset_index()
# would insert an extra 'level_0' column and eventually raise once that
# column already exists.
if 'Article_ID' not in total_df.columns:
    # reset_index() keeps the previous index as an 'index' column; the new
    # default index is then shifted so that identifiers start at 1.
    total_df.reset_index(inplace=True)
    total_df.index = total_df.index + 1
    total_df['Article_ID'] = total_df.index

In [6]:
# total_df.head()

In [7]:
# Destination of the exported dataset.
file_path = 'data/my_dataset.csv'

# Write all columns to CSV; the row index itself is left out (index=False).
total_df.to_csv(path_or_buf=file_path, index=False)

___

In [8]:
# Show the row(s) whose title matches this article name exactly.
total_df.loc[total_df['Article_Name'].eq('Possibilities of Project Management Using Artificial Intelligence')]

Unnamed: 0,index,Article_Name,Page,Authors,Text,Processed_Text,Article_ID
8,7,Possibilities of Project Management Using Artificial Intelligence,30,"Chyzhenko V., Kubiavka L.","Abstracts. The world is currently undergoing many changes as technology continues to evolve to improve people's lives. Technological progress has made things that seemed impossible in the last decade a part of modern life. All industries are using technology to improve efficiency and effectiveness while making management reliable, secure and interoperable. The integration of human capabilities and technological systems increases productivity and ensures continuous improvement. The combination of technology and human capabilities is also used for project management. Keywords: technological processes, artificial intelligence, project management, automation, integration of human capabilities and technical systems. Introduction. As the use of AI in organizations continues to grow and its benefits remain significant even during the COVID-19 pandemic, it is important for project managers to understand where AI can have the greatest impact. 
Better decision-making and more accurate foreca...",abstract world current undergo mani chang technolog continu evolv improv peopl live technolog progress made thing seem imposs last decad part modern life industri use technolog improv effici effect make manag reliabl secur interoper integr human capabl technolog system increas product ensur continu improv combin technolog human capabl also use project manag technolog process artifici intellig project manag autom integr human capabl technic system introduct use ai organ continu grow benefit remain signific even covid pandem import project manag understand ai greatest impact better decis make accur forecast project complet date one constant task project manag collect suffici accur date inform make inform decis abil artifici intellig process huge data set real time chang organiz mindset help bridg gap facilit decis make process abil connect differ type inform find problem peopl would otherwis miss real turn point project manag ai potenti detect real time anomali dataset alert manag pr...,8


In [9]:
# Query passage that will be compared against the corpus.
# Shorter alternative query, kept disabled:
# input_text = "This ability to process massive datasets ensures that issues and dependencies that would otherwise go unnoticed are captured during modeling and improve the quality of the proposed mitigation plan."
raw_input_text = "Resource management and capacity planning. Artificial intelligence systems enable project managers to better execute future projects through more efficient allocation of resources. As a result, organizations can anticipate needs and adapt schedules accordingly. The ideal AI system will assist project managers with capacity planning strategies that consider all relevant aspects of the project, including manpower, facilities, schedules, budgets and supplies. A byproduct of this advanced planning is a happier workforce as tasks are delegated based on past performance and overall capacity employee AI can help improve the processes used to manage people, including: calculation of the best distribution of resources and forecasting the deficit or surplus of resources; determination of the right skills for the right job; providing feedback on the implementation of the project; determination of special training for a specific employee; increasing the productivity of capital (reducing obstacles for employees can have a significant impact on the company's efficiency"

# The query goes through the same preprocess_text step as the corpus column.
input_text = preprocess_text(raw_input_text)

### Standard TF-IDF:

In [10]:
# Fit a TF-IDF model on the preprocessed corpus, with IDF smoothing requested
# explicitly, and keep the resulting document-term matrix.
# Disabled variant covering 1- to 5-grams:
# vectorizer = TfidfVectorizer(ngram_range=(1,5))
vectorizer = TfidfVectorizer(smooth_idf=True)
corpus_tfidf = vectorizer.fit_transform(raw_documents=total_df['Processed_Text'])

In [11]:
# Vectorize the query with the already-fitted vectorizer (wrapped in a list,
# i.e. a one-document collection), then compare it with every corpus row.
query_documents = [input_text]
provided_text_tfidf = vectorizer.transform(query_documents)
cosine_similarities = cosine_similarity(provided_text_tfidf, corpus_tfidf)

In [12]:
# Minimum cosine similarity for an article to be reported.
threshold = 0.5

# Pair each article title with its score for the single query (row 0) and
# print the ones scoring strictly above the threshold.
scored_titles = zip(total_df['Article_Name'], cosine_similarities[0])
for article_name, score in scored_titles:
    if score > threshold:
        print(f"Document '{article_name}' is similar with a score of {score}")

Document 'Possibilities of Project Management Using Artificial Intelligence' is similar with a score of 0.6318895571816717


### N-gram TF-IDF:

In [13]:
# Re-fit the TF-IDF model using n-grams of length 2 through 5 instead of the
# default range; this replaces the previous vectorizer and corpus matrix.
ngram_bounds = (2, 5)
vectorizer = TfidfVectorizer(ngram_range=ngram_bounds)

corpus_tfidf = vectorizer.fit_transform(raw_documents=total_df['Processed_Text'])

In [14]:
# Re-vectorize the query with the current vectorizer and recompute its cosine
# similarity against every row of the current corpus matrix.
provided_text_tfidf = vectorizer.transform(raw_documents=[input_text])
cosine_similarities = cosine_similarity(X=provided_text_tfidf, Y=corpus_tfidf)

In [15]:
# Minimum cosine similarity for an article to be reported in this run.
threshold = 0.1

# Walk titles and scores together (row 0 holds the scores for the single
# query) and skip everything that does not exceed the threshold.
for article_name, score in zip(total_df['Article_Name'], cosine_similarities[0]):
    if not score > threshold:
        continue
    print(f"Document '{article_name}' is similar with a score of {score}")

Document 'Possibilities of Project Management Using Artificial Intelligence' is similar with a score of 0.40849840966272727


#### SequenceMatcher() test:

In [16]:
from difflib import SequenceMatcher

def similarity(a, b, autojunk=True):
    """Return the difflib similarity ratio of two sequences.

    The score is ``SequenceMatcher.ratio()``: 2 * matches / (len(a) + len(b)),
    a float between 0.0 (nothing in common) and 1.0 (identical).

    Args:
        a: First sequence of hashable items (e.g. a string, compared
            character by character).
        b: Second sequence of hashable items.
        autojunk: Passed through to ``SequenceMatcher``. The default (True)
            keeps difflib's standard heuristic, which for a second sequence
            of 200 or more items ignores very frequent items while searching
            for matches. On long strings this can lower the ratio
            considerably; pass ``autojunk=False`` for an exact (but slower)
            comparison.

    Returns:
        The similarity ratio as a float.
    """
    return SequenceMatcher(None, a, b, autojunk=autojunk).ratio()

# Take the preprocessed text of the first row whose title matches exactly and
# compare it with the preprocessed query using similarity().
article_title = 'Possibilities of Project Management Using Artificial Intelligence'
title_matches = total_df['Article_Name'] == article_title
text_from_article = total_df.loc[title_matches, 'Processed_Text'].iloc[0]
similarity_score = similarity(input_text, text_from_article)

print(f"Similarity score: {similarity_score}")

Similarity score: 0.2895303748384317


___