-
Notifications
You must be signed in to change notification settings - Fork 0
/
00_formatting_SPOT_abio_data_04.05.2021.R
1176 lines (1006 loc) · 48 KB
/
00_formatting_SPOT_abio_data_04.05.2021.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#Format SPOT abiotic data
#Format the following data: SPOT data from Liv (1_...UnderProtection in /OriginalFiles), NOAA BEUTI upwelling data, NOAA CUTI upwelling data, NOAA Bakun upwelling data, NOAA MEI data
#Ver. 02.2021
getwd()
setwd("/Users/cfletcherh1/Desktop/DataAnalysis/SPOT_16S_18S_eLSA/")
#PART I: Chemical and biological data (nutrients, bacterial prod.)-----
#1. Liv's SPOT Data-----
#Already removed some columns and rows that were hashed out using LibreOffice/ Excel
#Read it in
Liv_data <- read.csv("abio_data_unformatted/SPOT_Nutrients_Fuhrman_Wrigley_2000_2019_Formatted.csv", header=T, stringsAsFactors = F)
dim(Liv_data) #890 9
names(Liv_data)
#Adjust column Depth_m to match DepthBin
#Some days 5m is zero, some days 5, some days "?"
length(grep("5m", Liv_data$DepthBin)) #199
Liv_data$Depth_m[grep("5m", Liv_data$DepthBin)]=5
length(grep("DCM", Liv_data$DepthBin)) #179
Liv_data$Depth_m[grep("DCM", Liv_data$DepthBin)] #This is gonna take some doing
length(grep("150", Liv_data$DepthBin)) #177
Liv_data$Depth_m[grep("150m", Liv_data$DepthBin)]=150
length(grep("500", Liv_data$DepthBin)) #182
Liv_data$Depth_m[grep("500", Liv_data$DepthBin)]=500
length(grep("890", Liv_data$DepthBin)) #153 why?
Liv_data$Depth_m[grep("890", Liv_data$DepthBin)]=890
#Write this back out and manually fix DCM depths
(x=Liv_data, file="abio_data_unformatted/SPOT_nutrients_formatted_02.13.2021.csv", row.names=F, quote=F, sep=",")
#Write back in
liv_data=read.csv("abio_data_unformatted/B_SPOT_nutrients_formatted_02.13.2021.csv", header=T, stringsAsFactors = F)
rm(Liv_data)
#Paste together year and month
dim(liv_data)
names(liv_data)
#liv_data$Year_Month=paste(liv_data$Year, liv_data$Month, sep="_")
head(liv_data$Year_Month)
#Also unique sample IDs with year, date, month, and depthBin
liv_data$SampleID=paste(liv_data$Year, liv_data$Month, liv_data$Day, liv_data$DepthBin, sep=".")
head(liv_data$SampleID)
length(unique(liv_data$SampleID)) #890
#Set up "Sample ID" column #Oh forgot I did that
#liv_data$SampleID=paste(liv_data$Year, liv_data$Month, liv_data$Day, liv_data$DepthBin, sep=".")
liv_data$Year.Mo.Dep=paste(liv_data$Year, liv_data$Month, liv_data$DepthBin, sep=".")
length(liv_data$Year.Mo.Dep) #890
length(unique(liv_data$Year.Mo.Dep)) #860 #Crap, there are 30 duplicates
liv_data$SampleID[which(duplicated(liv_data$Year.Mo.Dep))] #Shoot, there are 30 days where sampling was done twice in the same month, for some reason
#Remove duplicate rows??
#Remove duplicated samples: double check their sample ID against the SPOT 16S data
proks_data=read.table("ModifiedFiles/2.SPOT_16S_no_chloro_counts.tsv", header=T, stringsAsFactors = F, sep=" ") #For some
dim(proks_data)
colnames(proks_data)[1:20]
DATES=colnames(proks_data)[2:ncol(proks_data)]
DATES
#Remove proks_data as a variable, because I think it's slowing R down
rm(list=ls(pattern="proks_data"))
#Gonna have a lot of missing samples if we go through all the rows in liv_data -- just do the duplicates
du_rows=c()
for(i in liv_data$Year.Mo.Dep[which(duplicated(liv_data$Year.Mo.Dep))]){
print(i)
du_rows=c(du_rows, grep(i, liv_data$Year.Mo.Dep))
print(liv_data$SampleID[grep(i, liv_data$Year.Mo.Dep)])
}
length(du_rows) #72
length(unique(du_rows)) #54
#Grep liv_data rows in SPOT DATES variable
#If the row is missing from SPOT data, remove the ID of that row from liv_data
exclude_rows=c()
for(i in du_rows){
#print(liv_data$SampleID[i])
if(liv_data$Month[i]<10){
ID=paste(liv_data$Year[i], paste("0", liv_data$Month[i], sep=""), liv_data$Day[i], liv_data$DepthBin[i], sep=".")
no=grep(ID, DATES)
#print(no)
if(length(no)<1){
exclude_rows=c(exclude_rows, i)
}
}else{
ID=liv_data$SampleID[i]
no=grep(ID, DATES)
#print(no)
if(length(no)<1){
exclude_rows=c(exclude_rows, i)
}
}
}
exclude_rows
length(unique(exclude_rows)) #26
#Manually pick which rows we want to keep
liv_data$SampleID[exclude_rows]
liv_data$SampleID[grep("2000.10", liv_data$SampleID)] #keep 2000.10.9, exclude 10.27
exclude_rows=exclude_rows[-grep("2000.10.9", liv_data$SampleID[exclude_rows])]
liv_data$SampleID[grep("2012.7", liv_data$SampleID)] #Keep 2012.7.9, get rid of 2012.7.21?
liv_data$SampleID[grep("2014.7", liv_data$SampleID)] #Get rid of both the 17th and the 23rd
liv_data$SampleID[grep("2014.8", liv_data$SampleID)] #Get rid of the 14th
liv_data$SampleID[grep("2015.10", liv_data$SampleID)] #Get rid of the 7th
#Subset liv_data accordingly
dim(liv_data)
liv_subs=liv_data[-exclude_rows,]
dim(liv_subs)
#Check that there are no more duplicates
liv_subs$SampleID[which(duplicated(liv_subs$Year.Mo.Dep))]
sort(liv_subs$SampleID[grep("2012.6", liv_subs$SampleID)]) #Pick 6.20.2012, not 6.13.2012
liv_subs=liv_subs[-grep("2012.6.12|2012.6.13", liv_subs$SampleID),]
liv_subs$SampleID[grep("2013.12", liv_subs$SampleID)] #Get rid of 12.19.2013
liv_subs$SampleID[grep("2013.12.19", liv_subs$SampleID)]
liv_subs=liv_subs[-grep("2013.12.19", liv_subs$SampleID),]
which(duplicated(liv_subs$Year.Mo.Dep)) #NONE, F YEAH
#Now skip down to #3, reading in the Leu data
#2. SPOT CTD data-----
#First step: Read in the 175 files, grep only the columns I want, write out files
directory <- getwd()
filenames=system(paste('ls ', directory, '/SPOT_PROCESSED_CTD_Data/CTD_files', sep=''), intern=TRUE)
for(i in c(grep("asc.txt", filenames))){
print(filenames[i])
temp=read.table(file=paste("SPOT_PROCESSED_CTD_Data/CTD_files/", filenames[i], sep=""), header=T, stringsAsFactors = F, sep="\t")
#Make new df
temp_df=as.data.frame(matrix(nrow=nrow(temp), ncol=5))
colnames(temp_df)=c("Name", "Depth", "Temp", "Sal", "Oxy")
#Fill it with variables by grepping out columns corresponding to the variables I want: Name, PrDM/DepSM/Depth, T090/CTDTMP, Sal00/Sal11, OxML, put them in a new df
temp_df$Name=strsplit(filenames[i], split=".", fixed=T)[[1]][1]
temp_df$Depth=temp[,grep("PrDM|Depth|DepSM", colnames(temp), ignore.case=T)[1]] #Just take whichever comes first
temp_df$Temp=temp[,grep("T090|CTDTMP", colnames(temp), ignore.case=T)[1]]
temp_df$Sal=temp[,grep("Sal00|CTDSAL", colnames(temp), ignore.case=T)[1]]
temp_df$Oxy=temp[,grep("Sbeox0mL|Oxsol|Oxsat", colnames(temp), ignore.case=T)[1]]
#Order columns
temp_df=temp_df[,order(colnames(temp_df))]
#Write out
(x=temp_df, file=(paste("SPOT_PROCESSED_CTD_Data/temp_CTD_files/", temp_df$Name[1], "_temp_df.tsv", sep="")), quote=F, row.names=F, sep="\t")
}
#OMG it worked!! :D :D :D
#Write a separate R script to loop through all abbreviated CTD files, labeled *_temp_df.tsv, and pull out SPOT depths within a meter of each depth. (See SPOT_PROCESSED_CTD_Data/formatted_CTD_files_all_dates"01_parsing_CTD_data_v_04.02.2021.R" & "02_combine_subsetted_data.sh")
#Concatenate all subsetted files into one file, concat_subsetted_CTD_files_04.02.21.tsv
#Copy/ paste this file into "abio_data_unformatted" and read it in!! (Also copy/ paste error file, don't read that in)
#Removing all other variables called "CTD_*"
rm(list=ls(pattern="CTD"))
ls(pattern="CTD")
#Read in concatenated file
CTD_data=read.table("abio_data_unformatted/concat_subsetted_CTD_files_04.02.21.tsv", header=T, stringsAsFactors = F, sep="\t")
dim(CTD_data) #955 8
names(CTD_data)
#Get rid of 174 times that the rownames appear
length(grep("Depth", CTD_data$Depth)) #174
CTD_data=CTD_data[-grep("Depth", CTD_data$Depth),]
dim(CTD_data) #781 8
names(CTD_data)
grep("Depth", CTD_data$Depth) #integer(0)
#Convert Depth, Oxy, Sal, Temp, Year, Month, Date to be numeric, not characters
CTD_data$Depth=as.numeric(CTD_data$Depth)
head(CTD_data$Depth)
CTD_data$Oxy=as.numeric(CTD_data$Oxy)
CTD_data$Sal=as.numeric(CTD_data$Sal)
head(CTD_data$Sal)
CTD_data$Temp=as.numeric(CTD_data$Temp)
head(CTD_data$Temp)
CTD_data$Year=as.numeric(CTD_data$Year)
tail(CTD_data$Year)
CTD_data$Month=as.numeric(CTD_data$Month)
head(CTD_data$Month)
CTD_data$Date=as.numeric(CTD_data$Date)
head(CTD_data$Date)
#Write in Depth_bin variable
CTD_data$DepthBin="NA"
length(which(CTD_data$Depth<6)) #171
#174, booya #Wait, actually there are a few samples at depth between 5-10m, just how shallow is the DCM??
sort(liv_data$Depth_m[which(liv_data$DepthBin=="DCM")]) #Ok so there are some dates where DCM was 7m. Hmm. 7m. Anything less than 6m shall be assigned to 5m depth bin
CTD_data$DepthBin[which(CTD_data$Depth<6)]="5m"
length(which(CTD_data$Depth<145 & CTD_data$Depth>6)) #161
CTD_data$DepthBin[which(CTD_data$Depth<145 & CTD_data$Depth>6)]="DCM"
length(which(CTD_data$Depth>145 & CTD_data$Depth<495)) #174
CTD_data$DepthBin[which(CTD_data$Depth>145 & CTD_data$Depth<495)]="150m"
length(which(CTD_data$Depth>495&CTD_data$Depth<880)) #159
CTD_data$DepthBin[which(CTD_data$Depth>495&CTD_data$Depth<880)]="500m"
length(which(CTD_data$Depth>880)) #116
CTD_data$DepthBin[which(CTD_data$Depth>880)]="890m"
#Double check number of rows
length(which(CTD_data$Depth<6)) + length(which(CTD_data$Depth<145 & CTD_data$Depth>6)) + length(which(CTD_data$Depth>145 & CTD_data$Depth<495)) + length(which(CTD_data$Depth>495&CTD_data$Depth<880)) + length(which(CTD_data$Depth>880)) == nrow(CTD_data) #TRUE
#Create SampleID
head(liv_data$SampleID, n=15) #I guess August 2020 is missing from CTD data
CTD_data$SampleID=paste(CTD_data$Year, CTD_data$Month, CTD_data$Date, CTD_data$DepthBin, sep=".")
head(CTD_data$SampleID)
#Is SampleID unique?
length(unique(CTD_data$SampleID)) #732
length(CTD_data$SampleID) #781 #Uh oh
CTD_data$SampleID[which(duplicated(CTD_data$SampleID))]
#Just look at the first one
grep(CTD_data$SampleID[which(duplicated(CTD_data$SampleID))][1], CTD_data$SampleID)
CTD_data[grep(CTD_data$SampleID[which(duplicated(CTD_data$SampleID))][1], CTD_data$SampleID),] #Identical
CTD_data[grep(CTD_data$SampleID[which(duplicated(CTD_data$SampleID))][49], CTD_data$SampleID),] #Not identical, A and B casts from the same day
for(i in which(duplicated(CTD_data$SampleID))){
print(grep(CTD_data$SampleID[i], CTD_data$SampleID))
#print(CTD_data$Name[grep(CTD_data$SampleID[i], CTD_data$SampleID)])
}
#Ok, take the last one out of each element #also set up a vector of rows to include to be sure we keep that one
exclude_rows=c()
include_rows=c()
for(i in which(duplicated(CTD_data$SampleID))){
exclude_rows=c(exclude_rows, grep(CTD_data$SampleID[i], CTD_data$SampleID)[-length(grep(CTD_data$SampleID[i], CTD_data$SampleID))])
include_rows=c(include_rows, grep(CTD_data$SampleID[i], CTD_data$SampleID)[length(grep(CTD_data$SampleID[i], CTD_data$SampleID))])
}
length(unique(exclude_rows))
sort(exclude_rows)
sort(include_rows)
#Subset CTD data to exclude the rows that are duplicated
CTD_subs=CTD_data[-exclude_rows,]
dim(CTD_subs) #732 10
dim(CTD_data)
nrow(CTD_data)-nrow(CTD_subs)==length(unique(exclude_rows)) #TRUE
length(unique(CTD_subs$SampleID)) #732
#Write dummy variables into liv_data for oxygen, temp, salinity
names(liv_data)
liv_data$Oxyen="NA"
liv_data$Salinity="NA"
liv_data$Temperature="NA"
#Write variables from CTD_subs into Liv_data
#Samples missing FROM liv_data
missing_samples=c()
for(i in c(1:nrow(CTD_subs))){
a=grep(CTD_subs$SampleID[i], liv_data$SampleID)
if(length(a)<1){
missing_samples=c(missing_samples, CTD_subs$SampleID[i])
}#else{
#print(a)
#liv_data[a,c(11:13)]=CTD_subs[i,c(3:5)]
#}
}
missing_samples #Most of the missing samples are from 150m
length(missing_samples)
length(grep("150m", missing_samples)) #Yep, 11/15 missing samples
#Go through those missing samples and check them out
for(i in missing_samples){
print(i)
a=strsplit(i, split=".", fixed=T)
b=paste(a[[1]][1], a[[1]][2], a[[1]][3], sep=".")
print(liv_data$SampleID[grep(b, liv_data$SampleID)])
}
#All data from 9/12/2018 is missing
#Guess the others are too
#Convert Liv data to be numeric in the oxy, temp, sal departments
head(liv_data$Oxyen)
names(liv_data)[grep("Oxy", names(liv_data))]="Oxygen" #Forgot the G, oops!
liv_data$Oxygen=as.numeric(liv_data$Oxygen)
head(liv_data$Oxygen)
liv_data$Salinity=as.numeric(liv_data$Salinity)
head(liv_data$Salinity)
liv_data$Temperature=as.numeric(liv_data$Temperature)
head(liv_data$Temperature)
length(which(is.na(liv_data$Temperature)))/nrow(liv_data) #Only 20% of temp data is missing
length(which(is.na(liv_data$Temperature[grep("5m", liv_data$DepthBin)])))/length(grep("5m", liv_data$DepthBin)) #22% of temp data at 5m is missing
length(which(is.na(liv_data$Temperature)|is.na(liv_data$Oxygen)|is.na(liv_data$Salinity)))/nrow(liv_data) #173
#Overall, 20% of data missing - is it the same dates??
which(is.na(liv_data$Temperature) & liv_data$Oxygen>1) #yep, integer(0)
#COULD try to go to cruise logs and fill in manually
#3. Bacterial production data------
#a. Leucine data-----
leu_data=read.csv("abio_data_unformatted/SPOT_bacterial_production_Leu_2000_2021_04.05.2021.csv", header=T, stringsAsFactors = F)
dim(leu_data) #974 4
head(leu_data)
#First, split apart the mo.yr column to generate month, year, date
strsplit(head(leu_data$mo.yr), split="/", fixed=T)
leu_data$Month=01
leu_data$Day=01
leu_data$Year=2000
#Overwrite with real data
for(i in c(1:nrow(leu_data))){
leu_data$Month[i]=strsplit(leu_data$mo.yr[i], split="/", fixed=T)[[1]][1]
leu_data$Day[i]=strsplit(leu_data$mo.yr[i], split="/", fixed=T)[[1]][2]
leu_data$Year[i]=strsplit(leu_data$mo.yr[i], split="/", fixed=T)[[1]][3]
}
sort(unique(leu_data$Year))
leu_data$mo.yr[which(is.na(leu_data$Year))] #WTF is 11/05? Eff that!
leu_data$mo.yr[which(leu_data$Year==19)] #January through April of 2019
leu_data$Year[which(leu_data$Year==19)]=2019
#Fix problematic months
leu_data$Month[which(leu_data$Month=="01")]=1
leu_data$Month[which(leu_data$Month=="02")]=2
leu_data$Month[which(leu_data$Month=="03")]=3
leu_data$Month[which(leu_data$Month=="04")]=4
sort(unique(leu_data$Month))
#Then paste them together with DepthCategory to create unique sample IDs
leu_data$SampleID=paste(leu_data$Year, leu_data$Month, leu_data$Day, leu_data$DepthCategory, sep=".")
length(leu_data$SampleID) #969
length(unique(leu_data$SampleID)) #969
which(duplicated(leu_data$SampleID))
leu_data[grep(leu_data$SampleID[which(duplicated(leu_data$SampleID))][1], leu_data$SampleID),] #Can remove those data
leu_data[grep(leu_data$SampleID[which(duplicated(leu_data$SampleID))][2], leu_data$SampleID),] #And those too
#And year.month.depth
#Subsetting data to remove problematic samples
leu_data=leu_data[-c(grep(leu_data$SampleID[which(duplicated(leu_data$SampleID))][1], leu_data$SampleID), grep(leu_data$SampleID[which(duplicated(leu_data$SampleID))][2], leu_data$SampleID), which(is.na(leu_data$Year))),]
dim(leu_data) #969 rows
length(unique(leu_data$SampleID)) #Why is this much longer than Liv's data? #We are including months that Liv doesn't have data for
#Sort the leu_data by year (I think it's currently ordered by least -> most productive)
dim(leu_data)
leu_data=leu_data[order(leu_data$SampleID),]
dim(leu_data)
tail(leu_data$SampleID)
#Remove months after end of Liv's dataset
tail(sort(liv_data$SampleID)) #Last sampling date is 2019.2.13
leu_data$SampleID[which(leu_data$Year==2019 & leu_data$Month>2)[1]]
#Subset data
leu_subs=leu_data[c(1:(which(leu_data$Year==2019 & leu_data$Month>2)[1]-1)),]
dim(leu_subs) #886 9
tail(leu_subs$SampleID)
#Figure out which rows are duplicates, and remove them
length(unique(leu_subs$SampleID))
nrow(leu_subs)
which(duplicated(leu_subs)) #None! :)
#Create year.month.depth
leu_subs$Year.Mo.Depth=paste(leu_subs$Year, leu_subs$Month, leu_subs$DepthCategory, sep=".")
length(unique(leu_subs$Year.Mo.Depth)) #870 #16 duplicates
nrow(leu_subs)
#Set up dummy variables for leucine production in Liv's data
names(liv_subs)
liv_subs$BactProd_Leu="NA"
#I THINK this is how you set it to be NA, not "NA"
liv_subs$BactProd_Leu=as.numeric(liv_subs$BactProd_Leu)
head(liv_subs$BactProd_Leu)
length(which(is.na(liv_subs$BactProd_Leu))) #860
nrow(liv_subs)
#Write leu data in based on year.month.depth -- days of sampling may vary
missing_samples=c() #Samples missing FROM Liv's data
for(i in c(1:nrow(leu_subs))){
a=grep(leu_subs$Year.Mo.Depth[i], liv_subs$Year.Mo.Dep)
#print(a)
if(length(a)<1){
missing_samples=c(missing_samples, leu_subs$Year.Mo.Depth[i])
}#else{
#liv_subs$BactProd_Leu[a]=leu_data$Average_cell.ml.day[i]
#}
}
missing_samples=sort(missing_samples)
length(missing_samples) #66
nrow(leu_subs)-nrow(liv_subs) #26
#So there are 40 samples that are truly missing
length(which(is.na(liv_subs$BactProd_Leu))) #47
liv_subs$SampleID[which(is.na(liv_subs$BactProd_Leu))]
#Check that they are really missing
for(i in missing_samples){
a=strsplit(i, split=".", fixed=T)
b=paste(a[[1]][1], a[[1]][2], sep=".")
print(i)
print(liv_subs$Year.Mo.Dep[grep(b, liv_subs$Year.Mo.Dep)])
}
#Some of these missing samples are from too late in 2019
#Fix: "2005.7.883", "2006.8.887"
unique(liv_subs$DepthBin)
unique(liv_subs$Depth_m)
grep("887|883", liv_subs$Year.Mo.Dep) #IDK what that's about.
#OKAY, it looks like they really are missing!! :) Moving on to Thy!!
#b. Thymidine data-----
#Read in data
thy_data=read.csv("abio_data_unformatted/SPOT_bacterial_production_Thy_2000_2021_04.06.21.csv", header=T, stringsAsFactors = F)
dim(thy_data)
names(thy_data)
head(thy_data)
#Break apart that first column to create year, month, date
#Oh god, do all the months have leading zeros? #Nope, it's a mix 8)
strsplit(thy_data$mo.yr, split="/", fixed=T)
thy_data$Year=2000
thy_data$Month=1
thy_data$Date=1
#Fixing error message from for-loop real quick:
thy_data$mo.yr[440]
thy_data=thy_data[-440,]
dim(thy_data)
#Overwrite with actual sampling dates
for(i in c(1:nrow(thy_data))){
#print(thy_data$mo.yr[i])
a=strsplit(thy_data$mo.yr[i], split="/", fixed=T)
if(a[[1]][3]<2000){
thy_data$Year[i]=paste(20,a[[1]][3], sep="")
}else{
thy_data$Year[i]=a[[1]][3]
}
thy_data$Date[i]=a[[1]][2]
thy_data$Month[i]=a[[1]][1]
}
#Fix problematic dates
sort(unique(thy_data$Year))
thy_data$Year[which(thy_data$Year==21)]=2021
sort(unique(thy_data$Month))
thy_data$Month[which(thy_data$Month=="01")]=1
thy_data$Month[which(thy_data$Month=="02")]=2
thy_data$Month[which(thy_data$Month=="03")]=3
thy_data$Month[which(thy_data$Month=="04")]=4
thy_data$Month[which(thy_data$Month=="05")]=5
thy_data$Month[which(thy_data$Month=="06")]=6
thy_data$Month[which(thy_data$Month=="07")]=7
thy_data$Month[which(thy_data$Month=="08")]=8
thy_data$Month[which(thy_data$Month=="09")]=9
sort(unique(thy_data$Month))
#Paste into sample ID and year.mo.depth
unique(thy_data$DepthBIN)
#Remove samples from "0", "7", and "Bucket"
thy_data$DepthBIN[grep("5|DCM|150|500|890", thy_data$DepthBIN, invert=T)]
dim(thy_data) #590 7
thy_data=thy_data[-grep("5|DCM|150|500|890", thy_data$DepthBIN, invert=T),]
dim(thy_data) #575 7
unique(thy_data$DepthBIN)
thy_data$SampleID=paste(thy_data$Year, thy_data$Month, thy_data$Date, thy_data$DepthBIN, sep=".")
nrow(thy_data) #575
length(unique(thy_data$SampleID)) #575
thy_data$Year.Mo.Dep=paste(thy_data$Year, thy_data$Month, thy_data$DepthBIN, sep=".")
length(unique(thy_data$Year.Mo.Dep)) #563 - 12 duplicate months
#Sort by date
dim(thy_data)
thy_data=thy_data[order(thy_data$SampleID),]
dim(thy_data)
#Remove dates after end of liv_data
tail(liv_data$SampleID)
dim(thy_data) #575 9
thy_data=thy_data[c(1:(which(thy_data$Year==2019 & thy_data$Month>2)[1]-1)),]
dim(thy_data) #531 9
tail(thy_data$SampleID)
grep("2020", thy_data$SampleID)
#Remove duplicates
length(unique(thy_data$SampleID)) #531
length(unique(thy_data$Year.Mo.Dep)) #519 #12 duplicates
#Figure out which rows are duplicates
du_rows=c()
for(i in thy_data$SampleID[which(duplicated(thy_data$Year.Mo.Dep))]){
#print(i)
a=strsplit(i, fixed=T, split=".")
b=paste(a[[1]][1], a[[1]][2], a[[1]][4], sep=".")
du_rows=c(du_rows, grep(b, thy_data$Year.Mo.Dep))
}
du_rows
length(unique(du_rows)) #24, as expected
du_rows=unique(du_rows)
#Figure out which of the duplicates has 16S data
thy_data$SampleID[du_rows]
#Actually, I can just subset based on removing duplicates from Liv's data (see above)
grep("2000.10.27", thy_data$SampleID[du_rows]) #Get rid of 2000.10.27
DATES[grep("2012", DATES)] #Executive decision, keep 5.30.2012, remove 5.3.2012
grep("2012.5.3", thy_data$SampleID[du_rows])
liv_subs$SampleID[grep("2012.6", liv_subs$SampleID)] #Keep 6.20.2012, remove 6.13.2012
grep("2012.6.13", thy_data$SampleID[du_rows])
liv_subs$SampleID[grep("2012.9", liv_subs$SampleID)] #Keep 9.28.2012, remove 9.21.2012
grep("2012.9.21", thy_data$SampleID[du_rows])
liv_subs$SampleID[grep("2014.7", liv_subs$SampleID)] #Keep the 17th, get rid of the 24th
grep("2014.7.24", thy_data$SampleID[du_rows])
liv_subs$SampleID[grep("2015.10", liv_subs$SampleID)] #Keep 10.20.2015, Zachary's birthday :)
grep("2015.10.07", thy_data$SampleID[du_rows])
liv_subs$SampleID[grep("2016.5", liv_subs$SampleID)] #LAST ONE! Keep 5.18.2016, remove 5.16.2016
grep("2016.5.16", thy_data$SampleID[du_rows])
exclude_rows=du_rows[c(grep("2000.10.27", thy_data$SampleID[du_rows]),
grep("2012.5.3", thy_data$SampleID[du_rows]),
grep("2012.6.13", thy_data$SampleID[du_rows]),
grep("2012.9.21", thy_data$SampleID[du_rows]),
grep("2014.7.24", thy_data$SampleID[du_rows]),
grep("2015.10.07", thy_data$SampleID[du_rows]),
grep("2016.5.16", thy_data$SampleID[du_rows]))]
du_rows
exclude_rows
length(du_rows)
length(exclude_rows)
thy_data$SampleID[du_rows]
thy_data$SampleID[exclude_rows] #All the dates I want to get rid of
#Subset thy_data
dim(thy_data) #531 9
thy_subs=thy_data[-exclude_rows,]
dim(thy_subs) #519 9
#Make sure rows really are all unique
nrow(thy_subs)
length(unique(thy_subs$Year.Mo.Dep)) #519 #good!
#Write in dummy variable in Liv's data
liv_subs$BactProd_Thy="NA"
liv_subs$BactProd_Thy=as.numeric(liv_subs$BactProd_Thy)
#Overwrite with actual data, based on Year.Mo.Dep
missing_thy=c() #thy data missing FROM liv's samples
for(i in c(1:nrow(thy_subs))){
#print(thy_subs$Year.Mo.Dep[i])
a=grep(thy_subs$Year.Mo.Dep[i], liv_subs$Year.Mo.Dep)
#print(a)
if(length(a)<1){
missing_thy=c(missing_thy, thy_subs$Year.Mo.Dep[i])
}else{
liv_subs$BactProd_Thy[a]=thy_subs$Average_cells.ml.day[i]
}
}
missing_thy=sort(missing_thy)
length(missing_thy) #36 #ok
length(which(is.na(liv_subs$BactProd_Thy))) #246
nrow(liv_subs) #860
nrow(liv_subs)-length(which(is.na(liv_subs$BactProd_Thy))) #614 rows have data??
length(which(liv_subs$BactProd_Thy>0)) #614 #But there are 100 fewer rows in thy data?
length(which(duplicated(liv_subs$BactProd_Thy))) #453
#Wait
length(which(liv_subs$BactProd_Thy>0 & liv_subs$DepthBin=="500m")) #168 #Boom
#All the 5m samples are hitting the 500m samples too
#Change "5" to "5m" on DepthBin, re-paste Year.Mo.Dep and re-run floop
unique(thy_subs$DepthBIN)
#thy_data$DepthBIN[which(thy_subs$DepthBIN==5)]="5m" #Oops!
thy_subs$DepthBIN[which(thy_subs$DepthBIN==5)]="5m"
thy_subs$Year.Mo.Dep=paste(thy_subs$Year, thy_subs$Month, thy_subs$DepthBIN, sep=".")
unique(thy_subs$DepthBIN)
length(unique(thy_subs$Year.Mo.Dep)) #519 #good
#Double-check again
length(which(is.na(liv_subs$BactProd_Thy))) #377
nrow(liv_subs)-length(which(is.na(liv_subs$BactProd_Thy))) #483
nrow(thy_subs)-length(missing_thy) #483
(nrow(liv_subs)-length(which(is.na(liv_subs$BactProd_Thy))))+length(missing_thy)==nrow(thy_subs)
#Length of liv_data that has bacterial production available + length missing samples
#Which of the samples missing from thy data are also missing from leu data?
for(i in missing_thy){
print(i)
print(grep(i, missing_samples))
}
#Most of them!
#4. Hammond data------
hammond_data=read.csv("abio_data_unformatted/Hammond_WIES_abio_data_formatted_01.22.21.csv", header=T, stringsAsFactors = F)
dim(hammond_data) #1053 14
head(hammond_data)
names(hammond_data)
#Figure out which depths correspond to SPOT depths
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==12)] #Take bottle 12 to be 5m
#DCM data not sampled -> need to linearly interpolate
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==5)] #Take bottle 5 to be 150m #Ranges from 100-150m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==3)] #Take bottle 3 to be 500m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==1)] #Take bottle 1 to be 890m
#depth.m and depth are the same except for these samples at 885m
hammond_data[which(hammond_data$Depth!=hammond_data$Depth.m),grep("Depth", names(hammond_data))]
#Ok to use interchangeably
#Samples collected in 2017 and 2016 are missing depth data
hammond_data$Year[which(hammond_data$Depth=="")]
length(which(hammond_data$Depth.m=="")) #137 samples are missing depth
nrow(hammond_data) #1053
sort(hammond_data$CTD.Bottle.No[which(hammond_data$Depth.m=="")])
#Fix missing depths
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==1 & hammond_data$Depth.m==2)]=885 #Bottle1=885m #Accidentally set to 2m :(
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==2 & hammond_data$Depth.m=="")]=750 #Bottle2=750m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==3 & hammond_data$Depth.m=="")]=500 #Bottle3=500m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==4 & hammond_data$Depth.m=="")]=250 #Bottle4=250m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==5 & hammond_data$Depth.m=="")]=150 #Bottle5=150m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==6 & hammond_data$Depth.m=="")]=60 #Bottle 6=60m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==7 & hammond_data$Depth.m=="")]=50 #Bottle 7=50m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==8 & hammond_data$Depth.m=="")]=40 #Bottle 8=40m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==9 & hammond_data$Depth.m=="")]=30 #Bottle 9=30m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==10 & hammond_data$Depth.m=="")]=20 #Bottle10=20m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==11 & hammond_data$Depth.m=="")]=10 #Bottle11=10m
hammond_data$Depth.m[which(hammond_data$CTD.Bottle.No==12 & hammond_data$Depth.m=="")]=2 #Bottle12=2m(
which(hammond_data$Depth.m=="") #integer(0)
which(is.na(hammond_data$Depth.m))
#Set unique IDs based on year, month, bottle number
head(paste(hammond_data$Year, hammond_data$Month, hammond_data$CTD.Bottle.No, sep="."))
hammond_data$SampleID=paste(hammond_data$Year, hammond_data$Month, hammond_data$CTD.Bottle.No, sep=".")
length(unique(hammond_data$SampleID)) #1034 #now 1053
nrow(hammond_data)
which(duplicated(hammond_data$SampleID)) #integer(0)
#There are ~20 duplicates <- remove em
hammond_data$SampleID[which(duplicated(hammond_data$SampleID))]
#There are X bottles from February 2016 that do not have a bottle number
View(hammond_data[which(hammond_data$Year==2016 & hammond_data$Month==2),])
#Ok fill in these bottle numbers
hammond_data$CTD.Bottle.No[which(hammond_data$Year==2016 & hammond_data$Month==2)]=c(12:5)
#Next problematic date is 11/2012
View(hammond_data[which(hammond_data$Year==2012 & hammond_data$Month==11),])
#Ok so there are two dates in this month: 28th and the 12th #Wait, looking at the original data, this is a typo
#They sampled on 11/28/2012 and 12/11/2012 #s#!t #Change the month
hammond_data$Month[which(hammond_data$Year==2012 & hammond_data$Month==11 & hammond_data$Day==12)]=12
#Reset sample IDs
#Write in a column for Year.Mo, important for grepping DCM
hammond_data$Year.Mo=paste(hammond_data$Year, hammond_data$Month, sep=".")
#Subset hammond_data to only take columns I want: depth.m, Si, fluor., Ph, etc
dim(hammond_data) #1053 17
grep("Depth.m|Si|Temp|Ph|Fluor|SampleID|Year.Mo", colnames(hammond_data))
hammond_subs=hammond_data[,grep("Depth.m|Si|Temp|Ph|Fluor|SampleID|Year.Mo", colnames(hammond_data))]
dim(hammond_subs) #1053 8
names(hammond_subs)
#Set up a new dataframe
sort(unique(paste(hammond_data$Year, hammond_data$Month, sep="."))) #from 10.2008-5.2018
length(sort(unique(paste(hammond_data$Year, hammond_data$Month, sep=".")))) #91 unique sampling dates
names(hammond_data) #WANT: Si, pH, Fluorescence
#Really going to linearly interpolate across just 3 data points? Ok.
int_hammond=as.data.frame(matrix(ncol=1, nrow=(92*13))) #12 bottles + 1 for DCM
dim(int_hammond)
rep(unique(paste(hammond_data$Year, hammond_data$Month, sep=".")), each=13)
c(12:9, "DCM", 8:1)
length(paste(rep(unique(paste(hammond_data$Year, hammond_data$Month, sep=".")), each=13), c(12:9, "DCM", 8:1), sep="."))
#rownames(int_hammond)=paste(rep(unique(paste(hammond_data$Year, hammond_data$Month, sep=".")), each=13), c(12:9, "DCM", 8:1), sep=".")
#colnames(int_hammond)=c("depth", "si", "pH", "fluorescence")
#Set up SampleID and Year.Mo
dim(int_hammond) #1196 1
names(int_hammond)=c("SampleID", "Year.Mo")
int_hammond$SampleID=paste(rep(unique(paste(hammond_data$Year, hammond_data$Month, sep=".")), each=13), c(12:9, "DCM", 8:1), sep=".")
int_hammond$Year.Mo=paste(rep(unique(paste(hammond_data$Year, hammond_data$Month, sep=".")), each=13))
length(unique(int_hammond$SampleID)) #1196
length(unique(int_hammond$Year.Mo)) #92 #ok
#Write in depth data with tidyverse
library(dplyr)
hammond_fulljoin=full_join(as.data.frame(int_hammond), as.data.frame(hammond_subs), by="SampleID")
dim(hammond_fulljoin) #1196 9
#Write in DCM depth
head(hammond_fulljoin$SampleID, n=20)
hammond_fulljoin$SampleID[which(is.na(hammond_fulljoin$Depth.m))] #Why are some of these not "DCM"??
length(grep("DCM", hammond_fulljoin$SampleID))
length(which(is.na(hammond_fulljoin$Depth.m))) #143
hammond_fulljoin$Depth.m[which(is.na(hammond_fulljoin$Depth.m))]
View(hammond_fulljoin[which(is.na(hammond_fulljoin$Depth.m)),])
nrow(int_hammond)-nrow(hammond_subs) #143, that looks ok
#Checking a specific sample
hammond_fulljoin[grep("2017.9.4",hammond_fulljoin$SampleID),] #Ok, no data
grep("2017.9.4", hammond_subs$SampleID) #integer(0)
#HMMM
for(i in hammond_fulljoin$SampleID[which(is.na(hammond_fulljoin$Depth.m))][grep("DCM", hammond_fulljoin$SampleID[which(is.na(hammond_fulljoin$Depth.m))], invert=T)]){
print(grep(i, hammond_subs$SampleID))
} #a couple of samples are wrong - they are missing from hammond_fulljoin when they shouldn't be #There are 4 missing
hammond_subs$SampleID[962:964]
hammond_fulljoin[grep("2017.9.12", hammond_fulljoin$SampleID),]
#Check to see if data are all similar
sort(hammond_fulljoin$Temperature.C[which(as.numeric(hammond_fulljoin$Depth.m)<10)])
sort(hammond_fulljoin$Si.um[which(as.numeric(hammond_fulljoin$Depth.m)<10)])
#Ok write in DCM depth with a for-loop
for(i in grep("DCM", hammond_fulljoin$SampleID)){
#print(hammond_fulljoin$SampleID[i])
#print(hammond_fulljoin$Depth.m[i])
a=grep(hammond_fulljoin$SampleID[i], liv_subs$Year.Mo.Dep)
#print(a)
if(length(a)<1){
print(hammond_fulljoin$SampleID[i])
hammond_fulljoin$Depth.m[i]=NA
}else{
hammond_fulljoin$Depth.m[i]=liv_subs$Depth_m[a]
}
#hammond_fulljoin$Depth.m[i]=liv_subs$Depth_m[grep(hammond_fulljoin$SampleID[i], liv_subs$Year.Mo.Dep)]
}
#There are 3 dates with no DCM data in liv_subs: 2010.1, 2010.3, 2010.4
sort(hammond_fulljoin$Depth.m[grep("DCM", hammond_fulljoin$SampleID)]) #Ok they are showing up as characters #But ok
#Write out hammond_fulljoin data, use LibraOffice to sort by date, then by depth #also remove second year.mo column, not sure how that got in there
setwd("/Users/cfletcherh1/Desktop/DataAnalysis/SPOT_16S_18S_eLSA/")
(file="abio_data_unformatted/temp_hammond_data_for_interpolation_04.20.2021.tsv", x=hammond_fulljoin, quote=F, row.names = F, sep="\t")
#Read back in
hammond=read.table("abio_data_unformatted/temp_hammond_data_for_interpolation_04.20.2021.tsv", header=T, stringsAsFactors = F, sep="\t")
dim(hammond) #1196 9
head(hammond, n=20)
unique(hammond$Year.Mo.x) #wait #why TF have they put a 0 at the end of the months <12??
hammond$SampleID[grep(".10.", hammond$SampleID)]
#Ok, re-write depth (bottle #), year, month based on SampleID
hammond$Year=2008
hammond$Month=1
hammond$CTDNo=12
for(i in c(1:nrow(hammond))){
a=strsplit(hammond$SampleID[i], split=".", fixed=T)
hammond$Year[i]=a[[1]][1]
hammond$Month[i]=a[[1]][2]
hammond$CTDNo[i]=a[[1]][3]
}
hammond$Year.Mo=paste(hammond$Year, hammond$Month, sep=".")
#Double check
for(i in sample(x=c(1:nrow(hammond)), size=12)){
print(hammond[i, grep("Sample|Year|Month|CTD", colnames(hammond))])
}
#Subset out fake Year.Mo data
dim(hammond) #9 13
grep("Year.Mo.x|Year.Mo.y", names(hammond))
hammond=hammond[,-grep("Year.Mo.x|Year.Mo.y", names(hammond))]
dim(hammond) #1196 11
#For each DCM depth take the weighted average of one row above and below DCM depth
#For Si, temp (funsies), Ph, Fluor
for(i in grep("DCM", hammond$SampleID)){
print(hammond$SampleID[i])
#print(c(hammond$Depth.m[i-1], hammond$Depth.m[i+1]))
#print(hammond$Depth.m[i])
#print(c(hammond$Si.um[i-1], hammond$Si.um[i+1]))
#Si
hammond$Si.um[i]=weighted.mean(x=c(as.numeric(hammond$Si.um[i-1]), as.numeric(hammond$Si.um[i+1])), w=c(as.numeric(hammond$Depth.m[i-1]), as.numeric(hammond$Depth.m[i+1])))
#Temp
hammond$Temperature.C=weighted.mean(x=c(as.numeric(hammond$Temperature.C[i-1]), as.numeric(hammond$Temperature.C[i+1])), w=c(as.numeric(hammond$Depth.m[i-1]), as.numeric(hammond$Depth.m[i+1])))
#Ph
hammond$Ph=weighted.mean(x=c(as.numeric(hammond$Ph[i-1]), as.numeric(hammond$Ph[i+1])), w=c(as.numeric(hammond$Depth.m[i-1]), as.numeric(hammond$Depth.m[i+1])))
#Fluorescence
hammond$Fluorescence.mg.per.cubic.m=weighted.mean(x=c(as.numeric(hammond$Fluorescence.mg.per.cubic.m[i-1]), as.numeric(hammond$Fluorescence.mg.per.cubic.m[i+1])), w=c(as.numeric(hammond$Depth.m[i-1]), as.numeric(hammond$Depth.m[i+1])))
}
#FANTASTIC!
#Write out this temporary file
(hammond, file="abio_data_unformatted/temp_hammond_data_interpolated_04.20.2021.tsv", quote=F, row.names=F, sep="\t")
#Write into liv_subs
names(liv_subs)
head(liv_subs$Year.Mo.Dep)
#Write in a column for SPOT depths
hammond$SPOT.Dep="NA"
#Bottle 12=5m
length(which(hammond$CTDNo==12))
hammond$SPOT.Dep[which(hammond$CTDNo==12)]="5m"
hammond$Depth.m[which(hammond$CTDNo==12)]
#Bottle DCM= ...DCM?...
hammond$Depth.m[which(hammond$CTDNo=="DCM")]
length(which(hammond$CTDNo=="DCM")) #92
hammond$SPOT.Dep[which(hammond$CTDNo=="DCM")]="DCM"
#Bottle 5=150m
hammond$Depth.m[which(hammond$CTDNo==5)]
length(which(hammond$CTDNo==5)) #92
hammond$SPOT.Dep[which(hammond$CTDNo==5)]="150m" #Ranges by as much as 50m
#Bottle3=500m
length(which(hammond$CTDNo==3)) #92
hammond$Depth.m[which(hammond$CTDNo==3)]
hammond$SPOT.Dep[which(hammond$CTDNo==3)]="500m"
#Bottle1=890m
length(which(hammond$CTDNo==1))
hammond$Depth.m[which(hammond$CTDNo==1)]
hammond$SPOT.Dep[which(hammond$CTDNo==1)]="890m"
#Make new Sample IDs based on SPOT Dep
hammond$Year.Mo.SPOTDep=paste(hammond$Year, hammond$Month, hammond$SPOT.Dep, sep=".")
#Subset Hammond data
length(which(hammond$SPOT.Dep!="NA")) #460
dim(hammond) #1196 13
subs_hammond=hammond[which(hammond$SPOT.Dep!="NA"),]
dim(subs_hammond) #460 13
which(subs_hammond$SPOT.Dep=="NA") #of course this means I removed some real dates too
length(grep("150m", subs_hammond$SPOT.Dep))
#Write fake columns into liv_subs
liv_subs$Si=as.numeric("NA")
liv_subs$pH=as.numeric("NA")
liv_subs$Fluorescence.mg.per.cubic.m=as.numeric("NA")
#WRITE INTO LIV DATA
missing_samples=c()
for(i in 1:nrow(subs_hammond)){
a=grep(subs_hammond$Year.Mo.SPOTDep[i], liv_subs$Year.Mo.Dep)
if(length(a)<1){
missing_samples=c(missing_samples,subs_hammond$Year.Mo.SPOTDep[i])
}else{
liv_subs$Si[a]=subs_hammond$Si.um[i]
liv_subs$pH[a]=subs_hammond$Ph[i]
liv_subs$Fluorescence.mg.per.cubic.m[a]=subs_hammond$Fluorescence.mg.per.cubic.m[i]
}
}
missing_samples #AWESOME!! Yeah, that looks right
#Re-order columns in liv_subs, and also get rid of those we don't need
names(liv_subs)
names(liv_subs)[c(1:5, grep("SampleID", colnames(liv_subs)),6,7,9,8,11:13,17:19,14,16)]
dim(liv_subs) #860 19
liv_FINAL=liv_subs[,c(1:5, grep("SampleID", colnames(liv_subs)),6,7,9,8,11:13,17:19,14,16)]
dim(liv_FINAL) #860 18
names(liv_FINAL)
#WRITE IT OUT!!
(liv_FINAL, "ModifiedFiles/SPOT_nutrient_CTD_prod_Hammond_data_04.20.2021.tsv", quote=F, row.names=F, sep="\t")
#FUCK YEAH!!!!
#PART II: Physical oceanography data (upwelling indices, MEI)-----
#5. CUTI and BEUTI data-----
#a. CUTI data
CUTI_data=read.csv("abio_data_unformatted/NOAA_CUTI_monthly.csv", header=T, stringsAsFactors = F)
dim(CUTI_data)
names(CUTI_data) #Want 33N and want data from SPOT years only
#Create a column for year_month to match that of Liv's data
CUTI_data$Year_Month=paste(CUTI_data$year, CUTI_data$month, sep="_")
head(CUTI_data$Year_Month)
#b. BEUTI data
BEUTI_data=read.csv("abio_data_unformatted/NOAA_BEUTI_monthly.csv", header=T, stringsAsFactors = F)
dim(BEUTI_data)
dim(CUTI_data)
BEUTI_data$Year_Month=paste(BEUTI_data$year, BEUTI_data$month, sep="_")
names(CUTI_data)==names(BEUTI_data) #All TRUE
#6. MEI data-----
MEI_data=read.csv("abio_data_unformatted/NOAA_MEI_Formatted.csv", header=T, stringsAsFactors = F)
dim(MEI_data) #42 13
names(MEI_data)
MEI_data$YEAR
#Gonna have to make this into a list with year_month, MEI ix
head(list(MEI_data))
MEI_list_1=as.list(as.numeric(t(as.matrix(MEI_data[,2:13])))) #Col 1= YEAR, remove that
head(MEI_list_1, n=24) #It goes down by row, not column
MEI_data[1:2,]
length(MEI_list_1)
MEI_list_2=c()
for(i in c(1:length(MEI_list_1))){
MEI_list_2=c(MEI_list_2, MEI_list_1[[i]][1])
}
length(MEI_list_2) #504
head(MEI_list_2, n=24)
MEI_data[1:2,]
#Now get a vector of dates up (month-year)
rep(MEI_data$YEAR[1]:MEI_data$YEAR[nrow(MEI_data)], each=12)
rep(1:12, times=length(MEI_data$YEAR))
MEI_yr_mo=paste(rep(MEI_data$YEAR[1]:MEI_data$YEAR[nrow(MEI_data)], each=12), rep(1:12, times=length(MEI_data$YEAR)), sep="_")
length(MEI_yr_mo) #504
length(unique(MEI_yr_mo)) #504
length(unique(MEI_yr_mo))==length(MEI_list_2) #TRUE
#Write into a new matrix/df
MEI_df=as.data.frame(matrix(ncol=2, nrow=504))
colnames(MEI_df)=c("Year_Month", "MEI_ix")
MEI_df$Year_Month=MEI_yr_mo
MEI_df$MEI_ix=MEI_list_2
#Check it over
head(MEI_data)
head(MEI_df, n=24)
MEI_df[sample(x=c(1:nrow(MEI_df)), size=10, replace=F),]
View(MEI_data)
#7. Bakun upwelling data-----
bakun_data=read.csv("abio_data_unformatted/NOAA_BakunUpwellingData_200_2018_33N_241E.csv", stringsAsFactors = F, header = T)
dim(bakun_data) #885 12
head(bakun_data)
#Get rid of the first row, which only has the units for each measurement, not actual data
#But store it
meas=bakun_data[1,]
bakun_data=bakun_data[-1,]
dim(bakun_data) #884 12
#Pull out only the lat and longs I am interested in
unique(bakun_data$latitude) #Want 33.5N
unique(bakun_data$longitude) #Want 241.5E
length(which(bakun_data$latitude==33.5 & bakun_data$longitude==241.5)) #221
bakun_subs=bakun_data[which(bakun_data$latitude==33.5 & bakun_data$longitude==241.5),]
dim(bakun_subs) #221 12
#Re-format "time" to be month-year in the same format as CUTI and BEUIT data
#So no leading zeros
mo_yr <- c()
for(i in c(1:nrow(bakun_subs))){
a <- strsplit(bakun_subs$time[i], split="-", fixed=T)
if(as.numeric(a[[1]][2])<10){
b <- strsplit(a[[1]][2], split="0", fixed=T)
mo_yr=c(mo_yr, paste(a[[1]][1], b[[1]][2], sep="_"))
} else{
#print("duck")
mo_yr <- c(mo_yr, paste(a[[1]][1], a[[1]][2], sep="_"))
}
}
length(mo_yr) #221
length(unique(mo_yr)) #221, good
bakun_subs$Year_Month=mo_yr
#8. MODIS Satellite chlorophyll data-----
chl_data=read.table("OriginalFiles/TS_TMWchlaSmdayAverages_x-118.4_y33.55_t20020716120000_T20180716120000.asc.txt", header=T, stringsAsFactors = F, sep="\t")
dim(chl_data) #193 6
head(chl_data) #Time is reported as seconds since 1970-01-01T00Z
#Format time from seconds to date.time
as.POSIXct(chl_data$TIME, origin='1970-01-01 00:00:00') #THAT WORKS!
#Write a fake column of data
chl_data$Date="1970-01-01"
#Write this into a for-loop
for(i in c(1:nrow(chl_data))){
a=as.POSIXct(chl_data$TIME[i], origin='1970-01-01 00:00:00')
b=strsplit(as.character(a), fixed=T, split=" ")
chl_data$Date[i]=b[[1]][1]
}
head(chl_data$Date)
tail(chl_data$Date)
#Split again to create year and month
chl_data$Year_Month="1970_01"
for(i in c(1:nrow(chl_data))){
#print(chl_data$Date[i])
a=strsplit(chl_data$Date[i], fixed=T, split="-")
YEAR=a[[1]][1]
b=a[[1]][2]
#print(b)
if(as.numeric(b)<10){
d=strsplit(b, split="", fixed=T)
MONTH=d[[1]][2]
}else{
MONTH=b
}
chl_data$Year_Month[i]=paste(YEAR, MONTH, sep="_")
}
unique(chl_data$Year_Month)
#9. MODIS SST data-----
sst_data=read.table("OriginalFiles/TS_TMWsstdSmdayAverages_x-118.4_y33.55_t20020716120000_T20180716120000.asc.txt", header=T, stringsAsFactors = F, sep="\t")
head(sst_data)
dim(sst_data) #191 6
names(sst_data)
#Write a fake column for date
sst_data$Date="1970-01-01"
dim(sst_data) #191 7
#Overwrite with actual data
for(i in c(1:nrow(sst_data))){
a=as.POSIXct(sst_data$TIME[i], origin='1970-01-01 00:00:00')
b=strsplit(as.character(a), split=" ")
sst_data$Date[i]=b[[1]][1]
}
head(sst_data$Date)
tail(sst_data$Date)