In [65]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


In [29]:
cwd = os.getcwd()
file_name = "MINDsmall_train/news.tsv"
file_path =  os.path.join(cwd, file_name)

In [30]:
news_df = pd.read_csv(file_path, sep='\t',header=None, names=['NewsID', 'Category', 'SubCategory', 'Title' , 'Abstract' , 'URL', 'TitleEntities', 'AbstractEntities'])

In [31]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [66]:
news_df.drop(columns=['URL', 'TitleEntities' ,'AbstractEntities' ], inplace=True)

In [67]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."


In [59]:
import openai as OpenAI
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

Python-dotenv could not parse statement starting at line 2


In [60]:
openai.api_key = OPENAI_API_KEY
openai_client = openai.OpenAI()

In [61]:
from pinecone import Pinecone, ServerlessSpec

INDEX_NAME = 'pc-index-news'
pinecone = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME)

In [69]:
from sentence_transformers import SentenceTransformer


def get_embeddings(articles, model=SentenceTransformer('all-MiniLM-L6-v2')):
   return model.encode(articles)

In [90]:
sentences = ["This is an example sentence", "Each sentence is converted"]
get_embeddings(sentences)

array([[ 6.76568970e-02,  6.34959415e-02,  4.87130731e-02,
         7.93049261e-02,  3.74480411e-02,  2.65283999e-03,
         3.93749252e-02, -7.09848013e-03,  5.93613945e-02,
         3.15369926e-02,  6.00981116e-02, -5.29051982e-02,
         4.06067967e-02, -2.59308629e-02,  2.98428256e-02,
         1.12688157e-03,  7.35148713e-02, -5.03818206e-02,
        -1.22386612e-01,  2.37028431e-02,  2.97265705e-02,
         4.24768627e-02,  2.56337319e-02,  1.99515629e-03,
        -5.69190457e-02, -2.71598715e-02, -3.29035260e-02,
         6.60249069e-02,  1.19007207e-01, -4.58791517e-02,
        -7.26214275e-02, -3.25840414e-02,  5.23413494e-02,
         4.50553037e-02,  8.25302675e-03,  3.67024392e-02,
        -1.39415357e-02,  6.53918609e-02, -2.64272299e-02,
         2.06400960e-04, -1.36643723e-02, -3.62810530e-02,
        -1.95044074e-02, -2.89737955e-02,  3.94270197e-02,
        -8.84090513e-02,  2.62426981e-03,  1.36713330e-02,
         4.83062863e-02, -3.11566480e-02, -1.17329210e-0

In [74]:
chunks

<pandas.io.parsers.readers.TextFileReader at 0x14bebd610>

In [93]:
from tqdm.auto import tqdm, trange

CHUNK_SIZE=10
TOTAL_ROWS=100
progress_bar = tqdm(total=TOTAL_ROWS)
use_columns = [0,1,2,3,4]

chunks = pd.read_csv(file_path, sep='\t',header=None,usecols=use_columns , 
                     names=['NewsID', 'Category', 'SubCategory', 'Title' , 'Abstract' ]
                     , chunksize=CHUNK_SIZE, 
                     nrows=TOTAL_ROWS)



chunk_num = 0
for chunk in chunks:
    titles = chunk['Title'].tolist()
    print(titles)
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings[i],
                'metadata':{'title':titles[i]},} for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 200:
      index.upsert(prepped)
      prepped = []
    progress_bar.update(len(chunk))

100%|██████████| 100/100 [00:14<00:00,  6.81it/s]


['The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By', '50 Worst Habits For Belly Fat', "The Cost of Trump's Aid Freeze in the Trenches of Ukraine's War", "I Was An NBA Wife. Here's How It Affected My Mental Health.", 'How to Get Rid of Skin Tags, According to a Dermatologist', 'Should NFL be able to fine players for criticizing officiating?', "It's been Orlando's hottest October ever so far, but cooler temperatures on the way", 'Chile: Three die in supermarket fire amid protests', 'Best PS5 games: top PlayStation 5 titles to look forward to', 'How to report weather-related closings, delays']
['50 Foods You Should Never Eat, According to Health Experts', 'Trying to Make a Ram 3500 as Quick as a Viper Requires Some Disassembly', '25 Biggest Grocery Store Mistakes Making You Gain Weight', 'Instagram Filters with Plastic Surgery-Inspired Effects Could Soon Disappear', 'Michigan apple recall: Nearly 2,300 crates could be contaminated with listeria', "Kate Middleton's Be



['Elijah Cummings to lie in state at US Capitol Thursday', '17 Abandoned Theme Parks to Explore for Thrills, Chills, and Nostalgia', 'Ford Bronco Test Mule Spotted Flexing Its Muscles in Australia', "Trump's Trustbusters Bring Microsoft Lessons to Big Tech Fight", 'PGA Tour winners', '10 Things You Never Knew About Owls', '25 Last-Minute Ideas That Will Absolutely Save Your Holiday Dinner', 'Every outfit Duchess Kate has worn in 2019', 'New iPad Pro 2019 release date, price, news and leaks', 'These Simple Diet Changes Helped This Guy Lose 75 Pounds in 9 Months']
['Without help from US, UN climate fund struggles to meet goal', 'Best PS4 games 2019: play great PlayStation 4 games', 'The 23 Best TV Reboots of All Time', 'Netflix hackers: Why they steal passwords', '20 Must-Try Gravy Recipes for Your Thanksgiving Feast', 'A little snow causes a big mess, more than 100 crashes on Minnesota roads', 'Extinction Rebellion wins court challenge against London police', 'LA rivalry takes on new me



["23 Things Car Dealerships Don't Want You To Know", 'Locked On Suns Wednesday: Suns blast Kings to begin 2019-20 season', "TV shows we're most excited about for the rest of the year", 'The Absolute Best One-piece Bathing Suits of 2019', '40 Things I Wish I Could Tell My Younger Self', 'Why Is This Weird Chevy Colorado Test Mule Trolling Ford HQ?', '20 Words and Phrases You Had No Idea Were Coined in New York City', "All of Kevin Harvick's Monster Energy Series victories", 'PCC Community Markets Plans Its First Fast Casual Restaurant Inside New Ballard Store', 'Where to Find a Good, Cheap Sandwich in Every State']
['16 major chains where you can buy plant-based versions of your fast-food favorites', 'Do You Hate Meditating? So Do These People, But They Do It Anyway', "So far the World Series is one big reminder that you can't predict October baseball", '40 Celebrity Mothers and Daughters at the Same Age', 'The Top 50 Fast Food Restaurants in America', 'Take heart, millennials: Investin



['Redskins place TE Jordan Reed on injured reserve, ending his season', 'NFL Week 9 Odds: Point Spreads, Betting Lines For All 14 Football Games', '27 products you only need to buy once in your life', 'Ranking 26 fast-food fried-chicken sandwiches (including Popeyes)', '17 Surprising Ways Penny-Pinching Costs You More', "The Fascinating Story Behind Where 'Hocus Pocus' Was Filmed", 'Early symptoms of dementia: Be aware of subtle signs', 'Man killed in officer-involved shooting in St. Louis', '40 Etiquette Mistakes You Need to Stop Making by Age 40', 'Patriots vs. Ravens: 3 keys to the game, fun facts, prediction']


In [82]:
def subarray_sum(nums, k):
    sum_counts = {0: 1}
    cumulative_sum = 0
    count = 0

    for num in nums:
        print('***Start next itme***')
        cumulative_sum += num
        print(f'num is {num}')
        print(f'Cumulative Sum is now: {cumulative_sum}')
        
        if cumulative_sum - k in sum_counts:
            count += sum_counts[cumulative_sum - k]
            print(f'count updated to : {count}')
            
        
        if cumulative_sum in sum_counts:
            sum_counts[cumulative_sum] += 1            
        else:
            sum_counts[cumulative_sum] = 1
        
        print(f'sum_counts updated to : {sum_counts}')
            
    
    return count

In [83]:
subarray_sum([2, 2, 2], 4)

***Start next itme***
num is 2
Cumulative Sum is now: 2
sum_counts updated to : {0: 1, 2: 1}
***Start next itme***
num is 2
Cumulative Sum is now: 4
count updated to : 1
sum_counts updated to : {0: 1, 2: 1, 4: 1}
***Start next itme***
num is 2
Cumulative Sum is now: 6
count updated to : 2
sum_counts updated to : {0: 1, 2: 1, 4: 1, 6: 1}


2

False

In [87]:
p_dict = {'(':0 , '[':0, '{': 0 , ')':0 , ']':0 , '}':0}

In [89]:
p_dict['(']

0