# CROP PREDICTION SYSTEM (KRISHI VIKAS)

# TF-IDF Based Recommender
1. Represent crops in terms of bag of words
2. Represent crops in terms of most grown crop and least grown crop
3. Generate TF-IDF matrix for farmers most grown crop and least grown crop
4. Calculate cosine similarity between crop grown in highest land area and least land area 
5. Get the recommended crops 

**Describing parameters**:

*1. PATH_CROPS: specify the path where crop.csv is present*  <br/>
*2. GROWN_CROP: List of CROP_Ids GROWN by the user*  <br/>
*3. NO_PREDICTED_CROP: Refers to the number of predicted crop as a result*

In [1]:
PATH_CROPS="/Users/Dell/Music/apy.csv"
GROWN_CROP=[2,7,8,17,18,34]
NUM_PREDICTED_CROP=50

In [2]:
try:
    import numpy
    import pandas as pd
    import pickle as pk
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import re
    from nltk.stem.snowball import SnowballStemmer
    import nltk
    stemmer = SnowballStemmer("english")
except ImportError:
    print('You are missing some packages! ' \
          'We will try installing them before continuing!')
    !pip install "numpy" "pandas" "sklearn" "nltk"
    import numpy
    import pandas as pd
    import pickle as pk
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import re
    from nltk.stem.snowball import SnowballStemmer
    import nltk
    stemmer = SnowballStemmer("english")
    print('Done!')

## 1. Represent articles in terms of bag of words

1. Reading the csv file to get the State_Name, crop and Area
2. Remove punctuation marks and other symbols from each crop
3. Tokenize each crop
4. Stem token of every crop

In [4]:
grown_crop = pd.read_csv(PATH_CROPS)
grown_crop.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [5]:
#Select relevant columns and remove rows with missing values
grown_crop = grown_crop[['State_Name','District_Name','Crop_Year','Season','Crop','Area','Production']].dropna()
#articles is a list of all articles
crops = grown_crop['Crop'].tolist()
crops[0] #an uncleaned article

'Arecanut'

In [6]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]', ' ',document)       #remove punctuation marks and other symbols
    tokens = nltk.word_tokenize(document)              #Tokenize sentences
    cleaned_crops = ' '.join([stemmer.stem(item) for item in tokens])    #Stemming each token
    return cleaned_crops

In [7]:
cleaned_crops = list (map(clean_tokenize, crops[0:50]))
cleaned_crops  #a cleaned, tokenized and stemmed crops 

['arecanut',
 'other kharif puls',
 'rice',
 'banana',
 'cashewnut',
 'coconut',
 'dri ginger',
 'sugarcan',
 'sweet potato',
 'tapioca',
 'arecanut',
 'other kharif puls',
 'rice',
 'cashewnut',
 'coconut',
 'dri ginger',
 'sugarcan',
 'sweet potato',
 'rice',
 'arecanut',
 'banana',
 'black pepper',
 'cashewnut',
 'coconut',
 'dri chilli',
 'dri ginger',
 'sugarcan',
 'rice',
 'arecanut',
 'banana',
 'black pepper',
 'cashewnut',
 'coconut',
 'dri chilli',
 'dri ginger',
 'other oilse',
 'rice',
 'arecanut',
 'banana',
 'black pepper',
 'cashewnut',
 'coconut',
 'dri chilli',
 'dri ginger',
 'other oilse',
 'rice',
 'banana',
 'black pepper',
 'cashewnut',
 'coconut']

# 2. Represent states in terms of grown crops associated area


In [10]:
#Get user representation in terms of words associated with read articles
states_crops = ' '.join(cleaned_crops[i] for i in GROWN_CROP)

In [11]:
states_crops

'rice sugarcan sweet potato sweet potato rice dri ginger'

# 3. Generate TF-IDF matrix for states highest grown and least grown crop


In [20]:
#Generate tfidf matrix model for entire corpus
tfidf_matrix = TfidfVectorizer(stop_words='english', min_df=2)
crops_tfidf_matrix = tfidf_matrix.fit_transform(cleaned_crops)
crops_tfidf_matrix #tfidf vector of an article

<50x16 sparse matrix of type '<class 'numpy.float64'>'
	with 65 stored elements in Compressed Sparse Row format>

In [21]:
#Generate tfidf matrix model for read articles
states_crops_tfidf_vector = tfidf_matrix.transform([states_crops])
states_crops_tfidf_vector

<1x16 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [22]:
states_crops_tfidf_vector.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.20208139,  0.23204442,  0.        ,  0.        ,
         0.        ,  0.56653314,  0.        ,  0.44130601,  0.26200746,
         0.56653314]])

# 4. Calculate cosine similarity between most grown and least ungrown crops in a state 



In [23]:
crops_similarity_score=cosine_similarity(crops_tfidf_matrix, states_crops_tfidf_vector)

In [24]:
predicted_crops_id = crops_similarity_score.flatten().argsort()[::-1]

In [25]:
predicted_crops_id

array([17,  8, 18, 12,  2, 45, 36, 27, 43, 15, 25,  6, 34, 16, 26,  7, 24,
       33, 42, 10, 11, 13, 14, 19,  5,  4,  3,  1,  9, 49, 20, 21, 47, 46,
       44, 41, 40, 39, 38, 37, 35, 32, 31, 30, 29, 28, 48, 23, 22,  0], dtype=int64)

In [29]:
#Remove read articles from recommendations
final_predicted_crops_id = [article_id for article_id in predicted_crops_id 
                                 if article_id not in GROWN_CROP ][:NUM_PREDICTED_CROP]

# 5. Get the recommended crops 

In [31]:
final_predicted_crops_id

[12,
 45,
 36,
 27,
 43,
 15,
 25,
 6,
 16,
 26,
 24,
 33,
 42,
 10,
 11,
 13,
 14,
 19,
 5,
 4,
 3,
 1,
 9,
 49,
 20,
 21,
 47,
 46,
 44,
 41,
 40,
 39,
 38,
 37,
 35,
 32,
 31,
 30,
 29,
 28,
 48,
 23,
 22,
 0]

In [34]:
#Recommended Articles and their title
print ('Area Consumed by a crop')
print (grown_crop.loc[grown_crop['Production'].isin(GROWN_CROP)]['Crop'])
print ('\n')
print ('Predictions')
print (grown_crop.loc[grown_crop['Production'].isin(final_predicted_crops_id)]['Crop'])

Area Consumed by a crop
7                     Sugarcane
269                     Sesamum
270                        Urad
283                     Sesamum
309                        Urad
318                     Linseed
331                      Potato
341                        Gram
399                        Urad
406                     Linseed
413               Small millets
433                      Potato
455                    Soyabean
464                     Linseed
466           Moong(Green Gram)
524               Small millets
565                        Urad
614                    Turmeric
682         Other Kharif pulses
770                        Urad
785               Small millets
848               Small millets
876         Other Kharif pulses
931                   Cashewnut
947         Other Kharif pulses
978          Other  Rabi pulses
1040               Cotton(lint)
1057                    Linseed
1067                    Tobacco
1090                       Urad
                